//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
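//
// As a conceptual sketch of the transformation (an illustrative example, not
// taken from the papers above): with a vectorization factor of 4, a loop such
// as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is rewritten so that each iteration of the new loop handles four consecutive
// elements with wide SIMD instructions and the induction variable is advanced
// by 4 instead of 1, with a scalar epilogue loop handling the remaining
// (n % 4) iterations.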
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanHCFGTransforms.h"
#include "VPlanPredicator.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));

cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

/// A helper function that returns the type of the loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
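/// Note that any fast-math flags already present on the operation are
/// replaced with the full 'fast' set.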
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop into the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
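  /// For illustration: with an integer induction that starts at 10 and steps
  /// by 3, an Index of 4 is transformed to 10 + 4 * 3 = 22.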
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar loop.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark.
/// If \p I is passed it is an instruction that prevents vectorization.
/// Otherwise \p TheLoop is used for the location of the remark. \return the
/// remark object that can be streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set.
/// We use the TargetTransformInfo to query the different backends for the
/// cost of different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;

    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
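    /// For example, for an interleave group {A, B, C} whose insert position is
    /// B, the decision is recorded for A, B and C, but only B carries the
    /// group cost; A and C are recorded with a cost of 0, so the group cost is
    /// counted exactly once.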
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
0 : extract of last 1323 /// element) 1324 unsigned getUniformMemOpCost(Instruction *I, unsigned VF); 1325 1326 /// Estimate the overhead of scalarizing an instruction. This is a 1327 /// convenience wrapper for the type-based getScalarizationOverhead API. 1328 unsigned getScalarizationOverhead(Instruction *I, unsigned VF); 1329 1330 /// Returns whether the instruction is a load or store and will be a emitted 1331 /// as a vector operation. 1332 bool isConsecutiveLoadOrStore(Instruction *I); 1333 1334 /// Returns true if an artificially high cost for emulated masked memrefs 1335 /// should be used. 1336 bool useEmulatedMaskMemRefHack(Instruction *I); 1337 1338 /// Map of scalar integer values to the smallest bitwidth they can be legally 1339 /// represented as. The vector equivalents of these values should be truncated 1340 /// to this type. 1341 MapVector<Instruction *, uint64_t> MinBWs; 1342 1343 /// A type representing the costs for instructions if they were to be 1344 /// scalarized rather than vectorized. The entries are Instruction-Cost 1345 /// pairs. 1346 using ScalarCostsTy = DenseMap<Instruction *, unsigned>; 1347 1348 /// A set containing all BasicBlocks that are known to present after 1349 /// vectorization as a predicated block. 1350 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1351 1352 /// Records whether it is allowed to have the original scalar loop execute at 1353 /// least once. This may be needed as a fallback loop in case runtime 1354 /// aliasing/dependence checks fail, or to handle the tail/remainder 1355 /// iterations when the trip count is unknown or doesn't divide by the VF, 1356 /// or as a peel-loop to handle gaps in interleave-groups. 1357 /// Under optsize and when the trip count is very small we don't allow any 1358 /// iterations to execute in the scalar loop. 1359 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1360 1361 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1362 bool FoldTailByMasking = false; 1363 1364 /// A map holding scalar costs for different vectorization factors. The 1365 /// presence of a cost for an instruction in the mapping indicates that the 1366 /// instruction will be scalarized when vectorizing with the associated 1367 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1368 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; 1369 1370 /// Holds the instructions known to be uniform after vectorization. 1371 /// The data is collected per VF. 1372 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; 1373 1374 /// Holds the instructions known to be scalar after vectorization. 1375 /// The data is collected per VF. 1376 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1377 1378 /// Holds the instructions (address computations) that are forced to be 1379 /// scalarized. 1380 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1381 1382 /// Returns the expected difference in cost from scalarizing the expression 1383 /// feeding a predicated instruction \p PredInst. The instructions to 1384 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1385 /// non-negative return value implies the expression will be scalarized. 1386 /// Currently, only single-use chains are considered for scalarization. 1387 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1388 unsigned VF); 1389 1390 /// Collect the instructions that are uniform after vectorization. 
An 1391 /// instruction is uniform if we represent it with a single scalar value in 1392 /// the vectorized loop corresponding to each vector iteration. Examples of 1393 /// uniform instructions include pointer operands of consecutive or 1394 /// interleaved memory accesses. Note that although uniformity implies an 1395 /// instruction will be scalar, the reverse is not true. In general, a 1396 /// scalarized instruction will be represented by VF scalar values in the 1397 /// vectorized loop, each corresponding to an iteration of the original 1398 /// scalar loop. 1399 void collectLoopUniforms(unsigned VF); 1400 1401 /// Collect the instructions that are scalar after vectorization. An 1402 /// instruction is scalar if it is known to be uniform or will be scalarized 1403 /// during vectorization. Non-uniform scalarized instructions will be 1404 /// represented by VF values in the vectorized loop, each corresponding to an 1405 /// iteration of the original scalar loop. 1406 void collectLoopScalars(unsigned VF); 1407 1408 /// Keeps cost model vectorization decision and cost for instructions. 1409 /// Right now it is used for memory instructions only. 1410 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1411 std::pair<InstWidening, unsigned>>; 1412 1413 DecisionList WideningDecisions; 1414 1415 /// Returns true if \p V is expected to be vectorized and it needs to be 1416 /// extracted. 1417 bool needsExtract(Value *V, unsigned VF) const { 1418 Instruction *I = dyn_cast<Instruction>(V); 1419 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1420 return false; 1421 1422 // Assume we can vectorize V (and hence we need extraction) if the 1423 // scalars are not computed yet. This can happen, because it is called 1424 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1425 // the scalars are collected. That should be a safe assumption in most 1426 // cases, because we check if the operands have vectorizable types 1427 // beforehand in LoopVectorizationLegality. 1428 return Scalars.find(VF) == Scalars.end() || 1429 !isScalarAfterVectorization(I, VF); 1430 }; 1431 1432 /// Returns a range containing only operands needing to be extracted. 1433 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1434 unsigned VF) { 1435 return SmallVector<Value *, 4>(make_filter_range( 1436 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1437 } 1438 1439 public: 1440 /// The loop that we evaluate. 1441 Loop *TheLoop; 1442 1443 /// Predicated scalar evolution analysis. 1444 PredicatedScalarEvolution &PSE; 1445 1446 /// Loop Info analysis. 1447 LoopInfo *LI; 1448 1449 /// Vectorization legality. 1450 LoopVectorizationLegality *Legal; 1451 1452 /// Vector target information. 1453 const TargetTransformInfo &TTI; 1454 1455 /// Target Library Info. 1456 const TargetLibraryInfo *TLI; 1457 1458 /// Demanded bits analysis. 1459 DemandedBits *DB; 1460 1461 /// Assumption cache. 1462 AssumptionCache *AC; 1463 1464 /// Interface to emit optimization remarks. 1465 OptimizationRemarkEmitter *ORE; 1466 1467 const Function *TheFunction; 1468 1469 /// Loop Vectorize Hint. 1470 const LoopVectorizeHints *Hints; 1471 1472 /// The interleave access information contains groups of interleaved accesses 1473 /// with the same stride and close to each other. 1474 InterleavedAccessInfo &InterleaveInfo; 1475 1476 /// Values to ignore in the cost model. 
1477 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1478 1479 /// Values to ignore in the cost model when VF > 1. 1480 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1481 }; 1482 1483 } // end namespace llvm 1484 1485 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1486 // vectorization. The loop needs to be annotated with #pragma omp simd 1487 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1488 // vector length information is not provided, vectorization is not considered 1489 // explicit. Interleave hints are not allowed either. These limitations will be 1490 // relaxed in the future. 1491 // Please, note that we are currently forced to abuse the pragma 'clang 1492 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1493 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1494 // provides *explicit vectorization hints* (LV can bypass legal checks and 1495 // assume that vectorization is legal). However, both hints are implemented 1496 // using the same metadata (llvm.loop.vectorize, processed by 1497 // LoopVectorizeHints). This will be fixed in the future when the native IR 1498 // representation for pragma 'omp simd' is introduced. 1499 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1500 OptimizationRemarkEmitter *ORE) { 1501 assert(!OuterLp->empty() && "This is not an outer loop"); 1502 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1503 1504 // Only outer loops with an explicit vectorization hint are supported. 1505 // Unannotated outer loops are ignored. 1506 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1507 return false; 1508 1509 Function *Fn = OuterLp->getHeader()->getParent(); 1510 if (!Hints.allowVectorization(Fn, OuterLp, 1511 true /*VectorizeOnlyWhenForced*/)) { 1512 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1513 return false; 1514 } 1515 1516 if (Hints.getInterleave() > 1) { 1517 // TODO: Interleave support is future work. 1518 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1519 "outer loops.\n"); 1520 Hints.emitRemarkWithHints(); 1521 return false; 1522 } 1523 1524 return true; 1525 } 1526 1527 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1528 OptimizationRemarkEmitter *ORE, 1529 SmallVectorImpl<Loop *> &V) { 1530 // Collect inner loops and outer loops without irreducible control flow. For 1531 // now, only collect outer loops that have explicit vectorization hints. If we 1532 // are stress testing the VPlan H-CFG construction, we collect the outermost 1533 // loop of every loop nest. 1534 if (L.empty() || VPlanBuildStressTest || 1535 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1536 LoopBlocksRPO RPOT(&L); 1537 RPOT.perform(LI); 1538 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1539 V.push_back(&L); 1540 // TODO: Collect inner loops inside marked outer loops in case 1541 // vectorization fails for the outer loop. Do not invoke 1542 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1543 // already known to be reducible. We can use an inherited attribute for 1544 // that. 1545 return; 1546 } 1547 } 1548 for (Loop *InnerL : L) 1549 collectSupportedLoops(*InnerL, LI, ORE, V); 1550 } 1551 1552 namespace { 1553 1554 /// The LoopVectorize Pass. 
1555 struct LoopVectorize : public FunctionPass { 1556 /// Pass identification, replacement for typeid 1557 static char ID; 1558 1559 LoopVectorizePass Impl; 1560 1561 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1562 bool VectorizeOnlyWhenForced = false) 1563 : FunctionPass(ID) { 1564 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1565 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1566 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1567 } 1568 1569 bool runOnFunction(Function &F) override { 1570 if (skipFunction(F)) 1571 return false; 1572 1573 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1574 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1575 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1576 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1577 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1578 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1579 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1580 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1581 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1582 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1583 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1584 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1585 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1586 1587 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1588 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1589 1590 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1591 GetLAA, *ORE, PSI); 1592 } 1593 1594 void getAnalysisUsage(AnalysisUsage &AU) const override { 1595 AU.addRequired<AssumptionCacheTracker>(); 1596 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1597 AU.addRequired<DominatorTreeWrapperPass>(); 1598 AU.addRequired<LoopInfoWrapperPass>(); 1599 AU.addRequired<ScalarEvolutionWrapperPass>(); 1600 AU.addRequired<TargetTransformInfoWrapperPass>(); 1601 AU.addRequired<AAResultsWrapperPass>(); 1602 AU.addRequired<LoopAccessLegacyAnalysis>(); 1603 AU.addRequired<DemandedBitsWrapperPass>(); 1604 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1605 1606 // We currently do not preserve loopinfo/dominator analyses with outer loop 1607 // vectorization. Until this is addressed, mark these analyses as preserved 1608 // only for non-VPlan-native path. 1609 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1610 if (!EnableVPlanNativePath) { 1611 AU.addPreserved<LoopInfoWrapperPass>(); 1612 AU.addPreserved<DominatorTreeWrapperPass>(); 1613 } 1614 1615 AU.addPreserved<BasicAAWrapperPass>(); 1616 AU.addPreserved<GlobalsAAWrapperPass>(); 1617 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1618 } 1619 }; 1620 1621 } // end anonymous namespace 1622 1623 //===----------------------------------------------------------------------===// 1624 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1625 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1626 //===----------------------------------------------------------------------===// 1627 1628 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1629 // We need to place the broadcast of invariant variables outside the loop, 1630 // but only if it's proven safe to do so. Else, broadcast will be inside 1631 // vector loop body. 
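  // For illustration (hypothetical IR, assuming VF = 4 and an i32 scalar %x),
  // the splat created below is roughly:
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> undef, <4 x i32> zeroinitializer
  // When hoisting is safe, this sequence is emitted in the vector loop
  // preheader so it executes once instead of on every vector iteration.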
1632 Instruction *Instr = dyn_cast<Instruction>(V); 1633 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1634 (!Instr || 1635 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1636 // Place the code for broadcasting invariant variables in the new preheader. 1637 IRBuilder<>::InsertPointGuard Guard(Builder); 1638 if (SafeToHoist) 1639 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1640 1641 // Broadcast the scalar into all locations in the vector. 1642 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1643 1644 return Shuf; 1645 } 1646 1647 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1648 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1649 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1650 "Expected either an induction phi-node or a truncate of it!"); 1651 Value *Start = II.getStartValue(); 1652 1653 // Construct the initial value of the vector IV in the vector loop preheader 1654 auto CurrIP = Builder.saveIP(); 1655 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1656 if (isa<TruncInst>(EntryVal)) { 1657 assert(Start->getType()->isIntegerTy() && 1658 "Truncation requires an integer type"); 1659 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1660 Step = Builder.CreateTrunc(Step, TruncType); 1661 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1662 } 1663 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1664 Value *SteppedStart = 1665 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1666 1667 // We create vector phi nodes for both integer and floating-point induction 1668 // variables. Here, we determine the kind of arithmetic we will perform. 1669 Instruction::BinaryOps AddOp; 1670 Instruction::BinaryOps MulOp; 1671 if (Step->getType()->isIntegerTy()) { 1672 AddOp = Instruction::Add; 1673 MulOp = Instruction::Mul; 1674 } else { 1675 AddOp = II.getInductionOpcode(); 1676 MulOp = Instruction::FMul; 1677 } 1678 1679 // Multiply the vectorization factor by the step using integer or 1680 // floating-point arithmetic as appropriate. 1681 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1682 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1683 1684 // Create a vector splat to use in the induction update. 1685 // 1686 // FIXME: If the step is non-constant, we create the vector splat with 1687 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1688 // handle a constant vector splat. 1689 Value *SplatVF = isa<Constant>(Mul) 1690 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1691 : Builder.CreateVectorSplat(VF, Mul); 1692 Builder.restoreIP(CurrIP); 1693 1694 // We may need to add the step a number of times, depending on the unroll 1695 // factor. The last of those goes into the PHI. 
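  // For example (assuming VF = 4, UF = 2, and an i32 IV starting at 0 with a
  // constant step of 1): SteppedStart is <0, 1, 2, 3> and SplatVF is
  // <4, 4, 4, 4>; part 0 maps to the phi itself, part 1 maps to the first
  // "step.add" <4, 5, 6, 7>, and the value fed back into the phi from the
  // latch ("vec.ind.next") is <8, 9, 10, 11>.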
1696 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1697 &*LoopVectorBody->getFirstInsertionPt()); 1698 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1699 Instruction *LastInduction = VecInd; 1700 for (unsigned Part = 0; Part < UF; ++Part) { 1701 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1702 1703 if (isa<TruncInst>(EntryVal)) 1704 addMetadata(LastInduction, EntryVal); 1705 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1706 1707 LastInduction = cast<Instruction>(addFastMathFlag( 1708 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1709 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1710 } 1711 1712 // Move the last step to the end of the latch block. This ensures consistent 1713 // placement of all induction updates. 1714 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1715 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1716 auto *ICmp = cast<Instruction>(Br->getCondition()); 1717 LastInduction->moveBefore(ICmp); 1718 LastInduction->setName("vec.ind.next"); 1719 1720 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1721 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1722 } 1723 1724 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1725 return Cost->isScalarAfterVectorization(I, VF) || 1726 Cost->isProfitableToScalarize(I, VF); 1727 } 1728 1729 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1730 if (shouldScalarizeInstruction(IV)) 1731 return true; 1732 auto isScalarInst = [&](User *U) -> bool { 1733 auto *I = cast<Instruction>(U); 1734 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1735 }; 1736 return llvm::any_of(IV->users(), isScalarInst); 1737 } 1738 1739 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1740 const InductionDescriptor &ID, const Instruction *EntryVal, 1741 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1742 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1743 "Expected either an induction phi-node or a truncate of it!"); 1744 1745 // This induction variable is not the phi from the original loop but the 1746 // newly-created IV based on the proof that casted Phi is equal to the 1747 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1748 // re-uses the same InductionDescriptor that original IV uses but we don't 1749 // have to do any recording in this case - that is done when original IV is 1750 // processed. 1751 if (isa<TruncInst>(EntryVal)) 1752 return; 1753 1754 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1755 if (Casts.empty()) 1756 return; 1757 // Only the first Cast instruction in the Casts vector is of interest. 1758 // The rest of the Casts (if exist) have no uses outside the 1759 // induction update chain itself. 
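  // As a sketch (hypothetical IR): if the descriptor of phi %iv carries a cast
  // chain that SCEV proved (possibly under a runtime predicate) to be equal to
  // %iv, the widened value just created for %iv is recorded for that cast as
  // well, so users of the cast are served from the same map entry and no
  // separate vector cast is emitted.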
1760 Instruction *CastInst = *Casts.begin(); 1761 if (Lane < UINT_MAX) 1762 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1763 else 1764 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1765 } 1766 1767 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1768 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1769 "Primary induction variable must have an integer type"); 1770 1771 auto II = Legal->getInductionVars()->find(IV); 1772 assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); 1773 1774 auto ID = II->second; 1775 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1776 1777 // The scalar value to broadcast. This will be derived from the canonical 1778 // induction variable. 1779 Value *ScalarIV = nullptr; 1780 1781 // The value from the original loop to which we are mapping the new induction 1782 // variable. 1783 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1784 1785 // True if we have vectorized the induction variable. 1786 auto VectorizedIV = false; 1787 1788 // Determine if we want a scalar version of the induction variable. This is 1789 // true if the induction variable itself is not widened, or if it has at 1790 // least one user in the loop that is not widened. 1791 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); 1792 1793 // Generate code for the induction step. Note that induction steps are 1794 // required to be loop-invariant 1795 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && 1796 "Induction step should be loop invariant"); 1797 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1798 Value *Step = nullptr; 1799 if (PSE.getSE()->isSCEVable(IV->getType())) { 1800 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1801 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), 1802 LoopVectorPreHeader->getTerminator()); 1803 } else { 1804 Step = cast<SCEVUnknown>(ID.getStep())->getValue(); 1805 } 1806 1807 // Try to create a new independent vector induction variable. If we can't 1808 // create the phi node, we will splat the scalar induction variable in each 1809 // loop iteration. 1810 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { 1811 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1812 VectorizedIV = true; 1813 } 1814 1815 // If we haven't yet vectorized the induction variable, or if we will create 1816 // a scalar one, we need to define the scalar induction variable and step 1817 // values. If we were given a truncation type, truncate the canonical 1818 // induction variable and step. Otherwise, derive these values from the 1819 // induction descriptor. 1820 if (!VectorizedIV || NeedsScalarIV) { 1821 ScalarIV = Induction; 1822 if (IV != OldInduction) { 1823 ScalarIV = IV->getType()->isIntegerTy() 1824 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1825 : Builder.CreateCast(Instruction::SIToFP, Induction, 1826 IV->getType()); 1827 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1828 ScalarIV->setName("offset.idx"); 1829 } 1830 if (Trunc) { 1831 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1832 assert(Step->getType()->isIntegerTy() && 1833 "Truncation requires an integer step"); 1834 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1835 Step = Builder.CreateTrunc(Step, TruncType); 1836 } 1837 } 1838 1839 // If we haven't yet vectorized the induction variable, splat the scalar 1840 // induction variable, and build the necessary step vectors. 1841 // TODO: Don't do it unless the vectorized IV is really required. 1842 if (!VectorizedIV) { 1843 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1844 for (unsigned Part = 0; Part < UF; ++Part) { 1845 Value *EntryPart = 1846 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1847 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1848 if (Trunc) 1849 addMetadata(EntryPart, Trunc); 1850 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1851 } 1852 } 1853 1854 // If an induction variable is only used for counting loop iterations or 1855 // calculating addresses, it doesn't need to be widened. Create scalar steps 1856 // that can be used by instructions we will later scalarize. Note that the 1857 // addition of the scalar steps will not increase the number of instructions 1858 // in the loop in the common case prior to InstCombine. We will be trading 1859 // one vector extract for each scalar step. 1860 if (NeedsScalarIV) 1861 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1862 } 1863 1864 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1865 Instruction::BinaryOps BinOp) { 1866 // Create and check the types. 1867 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1868 int VLen = Val->getType()->getVectorNumElements(); 1869 1870 Type *STy = Val->getType()->getScalarType(); 1871 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1872 "Induction Step must be an integer or FP"); 1873 assert(Step->getType() == STy && "Step has wrong type"); 1874 1875 SmallVector<Constant *, 8> Indices; 1876 1877 if (STy->isIntegerTy()) { 1878 // Create a vector of consecutive numbers from zero to VF. 1879 for (int i = 0; i < VLen; ++i) 1880 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1881 1882 // Add the consecutive indices to the vector value. 1883 Constant *Cv = ConstantVector::get(Indices); 1884 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1885 Step = Builder.CreateVectorSplat(VLen, Step); 1886 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1887 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1888 // which can be found from the original scalar operations. 1889 Step = Builder.CreateMul(Cv, Step); 1890 return Builder.CreateAdd(Val, Step, "induction"); 1891 } 1892 1893 // Floating point induction. 1894 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1895 "Binary Opcode should be specified for FP induction"); 1896 // Create a vector of consecutive numbers from zero to VF. 1897 for (int i = 0; i < VLen; ++i) 1898 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1899 1900 // Add the consecutive indices to the vector value. 
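  // For example (assuming VF = 4, StartIdx = 0, BinOp == FAdd and a scalar
  // step %s), the result below is effectively
  //   %val + <0.0, 1.0, 2.0, 3.0> * splat(%s)
  // i.e. lane i holds the induction value advanced by i steps.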
1901 Constant *Cv = ConstantVector::get(Indices); 1902 1903 Step = Builder.CreateVectorSplat(VLen, Step); 1904 1905 // Floating point operations had to be 'fast' to enable the induction. 1906 FastMathFlags Flags; 1907 Flags.setFast(); 1908 1909 Value *MulOp = Builder.CreateFMul(Cv, Step); 1910 if (isa<Instruction>(MulOp)) 1911 // Have to check, MulOp may be a constant 1912 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1913 1914 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1915 if (isa<Instruction>(BOp)) 1916 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1917 return BOp; 1918 } 1919 1920 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1921 Instruction *EntryVal, 1922 const InductionDescriptor &ID) { 1923 // We shouldn't have to build scalar steps if we aren't vectorizing. 1924 assert(VF > 1 && "VF should be greater than one"); 1925 1926 // Get the value type and ensure it and the step have the same integer type. 1927 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1928 assert(ScalarIVTy == Step->getType() && 1929 "Val and Step should have the same type"); 1930 1931 // We build scalar steps for both integer and floating-point induction 1932 // variables. Here, we determine the kind of arithmetic we will perform. 1933 Instruction::BinaryOps AddOp; 1934 Instruction::BinaryOps MulOp; 1935 if (ScalarIVTy->isIntegerTy()) { 1936 AddOp = Instruction::Add; 1937 MulOp = Instruction::Mul; 1938 } else { 1939 AddOp = ID.getInductionOpcode(); 1940 MulOp = Instruction::FMul; 1941 } 1942 1943 // Determine the number of scalars we need to generate for each unroll 1944 // iteration. If EntryVal is uniform, we only need to generate the first 1945 // lane. Otherwise, we generate all VF values. 1946 unsigned Lanes = 1947 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1948 : VF; 1949 // Compute the scalar steps and save the results in VectorLoopValueMap. 1950 for (unsigned Part = 0; Part < UF; ++Part) { 1951 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1952 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1953 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1954 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1955 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1956 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1957 } 1958 } 1959 } 1960 1961 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 1962 assert(V != Induction && "The new induction variable should not be used."); 1963 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 1964 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 1965 1966 // If we have a stride that is replaced by one, do it here. Defer this for 1967 // the VPlan-native path until we start running Legal checks in that path. 1968 if (!EnableVPlanNativePath && Legal->hasStride(V)) 1969 V = ConstantInt::get(V->getType(), 1); 1970 1971 // If we have a vector mapped to this value, return it. 1972 if (VectorLoopValueMap.hasVectorValue(V, Part)) 1973 return VectorLoopValueMap.getVectorValue(V, Part); 1974 1975 // If the value has not been vectorized, check if it has been scalarized 1976 // instead. If it has been scalarized, and we actually need the value in 1977 // vector form, we will construct the vector values on demand. 
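  // For illustration (assuming VF = 4 and an i32 value scalarized into lanes
  // %v0..%v3), the on-demand vector form is assembled roughly as
  //   %p0 = insertelement <4 x i32> undef, i32 %v0, i32 0
  //   %p1 = insertelement <4 x i32> %p0,   i32 %v1, i32 1
  //   ...
  // unless the value is uniform after vectorization, in which case lane 0 is
  // simply broadcast instead.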
1978 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 1979 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 1980 1981 // If we've scalarized a value, that value should be an instruction. 1982 auto *I = cast<Instruction>(V); 1983 1984 // If we aren't vectorizing, we can just copy the scalar map values over to 1985 // the vector map. 1986 if (VF == 1) { 1987 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 1988 return ScalarValue; 1989 } 1990 1991 // Get the last scalar instruction we generated for V and Part. If the value 1992 // is known to be uniform after vectorization, this corresponds to lane zero 1993 // of the Part unroll iteration. Otherwise, the last instruction is the one 1994 // we created for the last vector lane of the Part unroll iteration. 1995 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 1996 auto *LastInst = cast<Instruction>( 1997 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 1998 1999 // Set the insert point after the last scalarized instruction. This ensures 2000 // the insertelement sequence will directly follow the scalar definitions. 2001 auto OldIP = Builder.saveIP(); 2002 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2003 Builder.SetInsertPoint(&*NewIP); 2004 2005 // However, if we are vectorizing, we need to construct the vector values. 2006 // If the value is known to be uniform after vectorization, we can just 2007 // broadcast the scalar value corresponding to lane zero for each unroll 2008 // iteration. Otherwise, we construct the vector values using insertelement 2009 // instructions. Since the resulting vectors are stored in 2010 // VectorLoopValueMap, we will only generate the insertelements once. 2011 Value *VectorValue = nullptr; 2012 if (Cost->isUniformAfterVectorization(I, VF)) { 2013 VectorValue = getBroadcastInstrs(ScalarValue); 2014 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2015 } else { 2016 // Initialize packing with insertelements to start from undef. 2017 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2018 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2019 for (unsigned Lane = 0; Lane < VF; ++Lane) 2020 packScalarIntoVectorValue(V, {Part, Lane}); 2021 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2022 } 2023 Builder.restoreIP(OldIP); 2024 return VectorValue; 2025 } 2026 2027 // If this scalar is unknown, assume that it is a constant or that it is 2028 // loop invariant. Broadcast V and save the value for future uses. 2029 Value *B = getBroadcastInstrs(V); 2030 VectorLoopValueMap.setVectorValue(V, Part, B); 2031 return B; 2032 } 2033 2034 Value * 2035 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2036 const VPIteration &Instance) { 2037 // If the value is not an instruction contained in the loop, it should 2038 // already be scalar. 2039 if (OrigLoop->isLoopInvariant(V)) 2040 return V; 2041 2042 assert(Instance.Lane > 0 2043 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2044 : true && "Uniform values only have lane zero"); 2045 2046 // If the value from the original loop has not been vectorized, it is 2047 // represented by UF x VF scalar values in the new loop. Return the requested 2048 // scalar value. 2049 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2050 return VectorLoopValueMap.getScalarValue(V, Instance); 2051 2052 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2053 // for the given unroll part. 
If this entry is not a vector type (i.e., the 2054 // vectorization factor is one), there is no need to generate an 2055 // extractelement instruction. 2056 auto *U = getOrCreateVectorValue(V, Instance.Part); 2057 if (!U->getType()->isVectorTy()) { 2058 assert(VF == 1 && "Value not scalarized has non-vector type"); 2059 return U; 2060 } 2061 2062 // Otherwise, the value from the original loop has been vectorized and is 2063 // represented by UF vector values. Extract and return the requested scalar 2064 // value from the appropriate vector lane. 2065 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2066 } 2067 2068 void InnerLoopVectorizer::packScalarIntoVectorValue( 2069 Value *V, const VPIteration &Instance) { 2070 assert(V != Induction && "The new induction variable should not be used."); 2071 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2072 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2073 2074 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2075 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2076 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2077 Builder.getInt32(Instance.Lane)); 2078 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2079 } 2080 2081 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2082 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2083 SmallVector<Constant *, 8> ShuffleMask; 2084 for (unsigned i = 0; i < VF; ++i) 2085 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2086 2087 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2088 ConstantVector::get(ShuffleMask), 2089 "reverse"); 2090 } 2091 2092 // Return whether we allow using masked interleave-groups (for dealing with 2093 // strided loads/stores that reside in predicated blocks, or for dealing 2094 // with gaps). 2095 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2096 // If an override option has been passed in for interleaved accesses, use it. 2097 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2098 return EnableMaskedInterleavedMemAccesses; 2099 2100 return TTI.enableMaskedInterleavedAccessVectorization(); 2101 } 2102 2103 // Try to vectorize the interleave group that \p Instr belongs to. 2104 // 2105 // E.g. Translate following interleaved load group (factor = 3): 2106 // for (i = 0; i < N; i+=3) { 2107 // R = Pic[i]; // Member of index 0 2108 // G = Pic[i+1]; // Member of index 1 2109 // B = Pic[i+2]; // Member of index 2 2110 // ... // do something to R, G, B 2111 // } 2112 // To: 2113 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2114 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2115 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2116 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2117 // 2118 // Or translate following interleaved store group (factor = 3): 2119 // for (i = 0; i < N; i+=3) { 2120 // ... 
do something to R, G, B 2121 // Pic[i] = R; // Member of index 0 2122 // Pic[i+1] = G; // Member of index 1 2123 // Pic[i+2] = B; // Member of index 2 2124 // } 2125 // To: 2126 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2127 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2128 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2129 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2130 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2131 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2132 VectorParts *BlockInMask) { 2133 const InterleaveGroup<Instruction> *Group = 2134 Cost->getInterleavedAccessGroup(Instr); 2135 assert(Group && "Fail to get an interleaved access group."); 2136 2137 // Skip if current instruction is not the insert position. 2138 if (Instr != Group->getInsertPos()) 2139 return; 2140 2141 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2142 Value *Ptr = getLoadStorePointerOperand(Instr); 2143 2144 // Prepare for the vector type of the interleaved load/store. 2145 Type *ScalarTy = getMemInstValueType(Instr); 2146 unsigned InterleaveFactor = Group->getFactor(); 2147 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2148 Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); 2149 2150 // Prepare for the new pointers. 2151 setDebugLocFromInst(Builder, Ptr); 2152 SmallVector<Value *, 2> NewPtrs; 2153 unsigned Index = Group->getIndex(Instr); 2154 2155 VectorParts Mask; 2156 bool IsMaskForCondRequired = BlockInMask; 2157 if (IsMaskForCondRequired) { 2158 Mask = *BlockInMask; 2159 // TODO: extend the masked interleaved-group support to reversed access. 2160 assert(!Group->isReverse() && "Reversed masked interleave-group " 2161 "not supported."); 2162 } 2163 2164 // If the group is reverse, adjust the index to refer to the last vector lane 2165 // instead of the first. We adjust the index from the first vector lane, 2166 // rather than directly getting the pointer for lane VF - 1, because the 2167 // pointer operand of the interleaved access is supposed to be uniform. For 2168 // uniform instructions, we're only required to generate a value for the 2169 // first vector lane in each unroll iteration. 2170 if (Group->isReverse()) 2171 Index += (VF - 1) * Group->getFactor(); 2172 2173 bool InBounds = false; 2174 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2175 InBounds = gep->isInBounds(); 2176 2177 for (unsigned Part = 0; Part < UF; Part++) { 2178 Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); 2179 2180 // Notice current instruction could be any index. Need to adjust the address 2181 // to the member of index 0. 2182 // 2183 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2184 // b = A[i]; // Member of index 0 2185 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2186 // 2187 // E.g. A[i+1] = a; // Member of index 1 2188 // A[i] = b; // Member of index 0 2189 // A[i+2] = c; // Member of index 2 (Current instruction) 2190 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2191 NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); 2192 if (InBounds) 2193 cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); 2194 2195 // Cast to the vector pointer type. 
2196 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); 2197 } 2198 2199 setDebugLocFromInst(Builder, Instr); 2200 Value *UndefVec = UndefValue::get(VecTy); 2201 2202 Value *MaskForGaps = nullptr; 2203 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2204 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2205 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2206 } 2207 2208 // Vectorize the interleaved load group. 2209 if (isa<LoadInst>(Instr)) { 2210 // For each unroll part, create a wide load for the group. 2211 SmallVector<Value *, 2> NewLoads; 2212 for (unsigned Part = 0; Part < UF; Part++) { 2213 Instruction *NewLoad; 2214 if (IsMaskForCondRequired || MaskForGaps) { 2215 assert(useMaskedInterleavedAccesses(*TTI) && 2216 "masked interleaved groups are not allowed."); 2217 Value *GroupMask = MaskForGaps; 2218 if (IsMaskForCondRequired) { 2219 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2220 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2221 Value *ShuffledMask = Builder.CreateShuffleVector( 2222 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2223 GroupMask = MaskForGaps 2224 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2225 MaskForGaps) 2226 : ShuffledMask; 2227 } 2228 NewLoad = 2229 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), 2230 GroupMask, UndefVec, "wide.masked.vec"); 2231 } 2232 else 2233 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], 2234 Group->getAlignment(), "wide.vec"); 2235 Group->addMetadata(NewLoad); 2236 NewLoads.push_back(NewLoad); 2237 } 2238 2239 // For each member in the group, shuffle out the appropriate data from the 2240 // wide loads. 2241 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2242 Instruction *Member = Group->getMember(I); 2243 2244 // Skip the gaps in the group. 2245 if (!Member) 2246 continue; 2247 2248 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2249 for (unsigned Part = 0; Part < UF; Part++) { 2250 Value *StridedVec = Builder.CreateShuffleVector( 2251 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2252 2253 // If this member has different type, cast the result type. 2254 if (Member->getType() != ScalarTy) { 2255 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2256 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2257 } 2258 2259 if (Group->isReverse()) 2260 StridedVec = reverseVector(StridedVec); 2261 2262 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2263 } 2264 } 2265 return; 2266 } 2267 2268 // The sub vector type for current instruction. 2269 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2270 2271 // Vectorize the interleaved store group. 2272 for (unsigned Part = 0; Part < UF; Part++) { 2273 // Collect the stored vector from each member. 2274 SmallVector<Value *, 4> StoredVecs; 2275 for (unsigned i = 0; i < InterleaveFactor; i++) { 2276 // Interleaved store group doesn't allow a gap, so each index has a member 2277 Instruction *Member = Group->getMember(i); 2278 assert(Member && "Fail to get a member from an interleaved store group"); 2279 2280 Value *StoredVec = getOrCreateVectorValue( 2281 cast<StoreInst>(Member)->getValueOperand(), Part); 2282 if (Group->isReverse()) 2283 StoredVec = reverseVector(StoredVec); 2284 2285 // If this member has different type, cast it to a unified type. 
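      // Members of an interleave group must have the same size but may have
      // different types; e.g. an i32 member and a float member can share a
      // group, in which case the stored vector is bit- or pointer-cast to the
      // sub-vector type derived from the group's insert position.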
2286 2287 if (StoredVec->getType() != SubVT) 2288 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2289 2290 StoredVecs.push_back(StoredVec); 2291 } 2292 2293 // Concatenate all vectors into a wide vector. 2294 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2295 2296 // Interleave the elements in the wide vector. 2297 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2298 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2299 "interleaved.vec"); 2300 2301 Instruction *NewStoreInstr; 2302 if (IsMaskForCondRequired) { 2303 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2304 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2305 Value *ShuffledMask = Builder.CreateShuffleVector( 2306 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2307 NewStoreInstr = Builder.CreateMaskedStore( 2308 IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); 2309 } 2310 else 2311 NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 2312 Group->getAlignment()); 2313 2314 Group->addMetadata(NewStoreInstr); 2315 } 2316 } 2317 2318 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2319 VectorParts *BlockInMask) { 2320 // Attempt to issue a wide load. 2321 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2322 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2323 2324 assert((LI || SI) && "Invalid Load/Store instruction"); 2325 2326 LoopVectorizationCostModel::InstWidening Decision = 2327 Cost->getWideningDecision(Instr, VF); 2328 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2329 "CM decision should be taken at this point"); 2330 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2331 return vectorizeInterleaveGroup(Instr); 2332 2333 Type *ScalarDataTy = getMemInstValueType(Instr); 2334 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2335 Value *Ptr = getLoadStorePointerOperand(Instr); 2336 unsigned Alignment = getLoadStoreAlignment(Instr); 2337 // An alignment of 0 means target abi alignment. We need to use the scalar's 2338 // target abi alignment in such a case. 2339 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2340 if (!Alignment) 2341 Alignment = DL.getABITypeAlignment(ScalarDataTy); 2342 unsigned AddressSpace = getLoadStoreAddressSpace(Instr); 2343 2344 // Determine if the pointer operand of the access is either consecutive or 2345 // reverse consecutive. 2346 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2347 bool ConsecutiveStride = 2348 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2349 bool CreateGatherScatter = 2350 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2351 2352 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2353 // gather/scatter. Otherwise Decision should have been to Scalarize. 2354 assert((ConsecutiveStride || CreateGatherScatter) && 2355 "The instruction should be scalarized"); 2356 2357 // Handle consecutive loads/stores. 2358 if (ConsecutiveStride) 2359 Ptr = getOrCreateScalarValue(Ptr, {0, 0}); 2360 2361 VectorParts Mask; 2362 bool isMaskRequired = BlockInMask; 2363 if (isMaskRequired) 2364 Mask = *BlockInMask; 2365 2366 bool InBounds = false; 2367 if (auto *gep = dyn_cast<GetElementPtrInst>( 2368 getLoadStorePointerOperand(Instr)->stripPointerCasts())) 2369 InBounds = gep->isInBounds(); 2370 2371 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2372 // Calculate the pointer for the specific unroll-part. 
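    // For example (assuming VF = 4): a forward access uses a GEP offset of
    // Part * VF elements (0 for part 0, 4 for part 1), while a reverse access
    // first steps back by Part * VF elements and then by a further VF - 1, so
    // the wide memory operation still starts at the lowest address of the VF
    // elements it covers.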
2373 GetElementPtrInst *PartPtr = nullptr; 2374 2375 if (Reverse) { 2376 // If the address is consecutive but reversed, then the 2377 // wide store needs to start at the last vector element. 2378 PartPtr = cast<GetElementPtrInst>( 2379 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2380 PartPtr->setIsInBounds(InBounds); 2381 PartPtr = cast<GetElementPtrInst>( 2382 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2383 PartPtr->setIsInBounds(InBounds); 2384 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2385 Mask[Part] = reverseVector(Mask[Part]); 2386 } else { 2387 PartPtr = cast<GetElementPtrInst>( 2388 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2389 PartPtr->setIsInBounds(InBounds); 2390 } 2391 2392 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2393 }; 2394 2395 // Handle Stores: 2396 if (SI) { 2397 setDebugLocFromInst(Builder, SI); 2398 2399 for (unsigned Part = 0; Part < UF; ++Part) { 2400 Instruction *NewSI = nullptr; 2401 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); 2402 if (CreateGatherScatter) { 2403 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2404 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2405 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2406 MaskPart); 2407 } else { 2408 if (Reverse) { 2409 // If we store to reverse consecutive memory locations, then we need 2410 // to reverse the order of elements in the stored value. 2411 StoredVal = reverseVector(StoredVal); 2412 // We don't want to update the value in the map as it might be used in 2413 // another expression. So don't call resetVectorValue(StoredVal). 2414 } 2415 auto *VecPtr = CreateVecPtr(Part, Ptr); 2416 if (isMaskRequired) 2417 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2418 Mask[Part]); 2419 else 2420 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2421 } 2422 addMetadata(NewSI, SI); 2423 } 2424 return; 2425 } 2426 2427 // Handle loads. 2428 assert(LI && "Must have a load instruction"); 2429 setDebugLocFromInst(Builder, LI); 2430 for (unsigned Part = 0; Part < UF; ++Part) { 2431 Value *NewLI; 2432 if (CreateGatherScatter) { 2433 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2434 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2435 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2436 nullptr, "wide.masked.gather"); 2437 addMetadata(NewLI, LI); 2438 } else { 2439 auto *VecPtr = CreateVecPtr(Part, Ptr); 2440 if (isMaskRequired) 2441 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], 2442 UndefValue::get(DataTy), 2443 "wide.masked.load"); 2444 else 2445 NewLI = 2446 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2447 2448 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2449 addMetadata(NewLI, LI); 2450 if (Reverse) 2451 NewLI = reverseVector(NewLI); 2452 } 2453 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2454 } 2455 } 2456 2457 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2458 const VPIteration &Instance, 2459 bool IfPredicateInstr) { 2460 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2461 2462 setDebugLocFromInst(Builder, Instr); 2463 2464 // Does this instruction return a value ? 
2465 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2466 2467 Instruction *Cloned = Instr->clone(); 2468 if (!IsVoidRetTy) 2469 Cloned->setName(Instr->getName() + ".cloned"); 2470 2471 // Replace the operands of the cloned instructions with their scalar 2472 // equivalents in the new loop. 2473 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2474 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2475 Cloned->setOperand(op, NewOp); 2476 } 2477 addNewMetadata(Cloned, Instr); 2478 2479 // Place the cloned scalar in the new loop. 2480 Builder.Insert(Cloned); 2481 2482 // Add the cloned scalar to the scalar map entry. 2483 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2484 2485 // If we just cloned a new assumption, add it the assumption cache. 2486 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2487 if (II->getIntrinsicID() == Intrinsic::assume) 2488 AC->registerAssumption(II); 2489 2490 // End if-block. 2491 if (IfPredicateInstr) 2492 PredicatedInstructions.push_back(Cloned); 2493 } 2494 2495 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2496 Value *End, Value *Step, 2497 Instruction *DL) { 2498 BasicBlock *Header = L->getHeader(); 2499 BasicBlock *Latch = L->getLoopLatch(); 2500 // As we're just creating this loop, it's possible no latch exists 2501 // yet. If so, use the header as this will be a single block loop. 2502 if (!Latch) 2503 Latch = Header; 2504 2505 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2506 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2507 setDebugLocFromInst(Builder, OldInst); 2508 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2509 2510 Builder.SetInsertPoint(Latch->getTerminator()); 2511 setDebugLocFromInst(Builder, OldInst); 2512 2513 // Create i+1 and fill the PHINode. 2514 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2515 Induction->addIncoming(Start, L->getLoopPreheader()); 2516 Induction->addIncoming(Next, Latch); 2517 // Create the compare. 2518 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2519 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2520 2521 // Now we have two terminators. Remove the old one from the block. 2522 Latch->getTerminator()->eraseFromParent(); 2523 2524 return Induction; 2525 } 2526 2527 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2528 if (TripCount) 2529 return TripCount; 2530 2531 assert(L && "Create Trip Count for null loop."); 2532 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2533 // Find the loop boundaries. 2534 ScalarEvolution *SE = PSE.getSE(); 2535 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2536 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2537 "Invalid loop count"); 2538 2539 Type *IdxTy = Legal->getWidestInductionType(); 2540 assert(IdxTy && "No type for induction"); 2541 2542 // The exit count might have the type of i64 while the phi is i32. This can 2543 // happen if we have an induction variable that is sign extended before the 2544 // compare. The only way that we get a backedge taken count is that the 2545 // induction variable was signed and as such will not overflow. In such a case 2546 // truncation is legal. 
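  // For example: in a loop such as 'for (i32 i = 0; (i64)i < n; ++i)' the
  // backedge-taken count is an i64 expression while the widest induction type
  // is i32, so the count is truncated to i32 here; the trip count is then the
  // backedge-taken count plus one, expanded into the loop preheader below.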
2547 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2548 IdxTy->getPrimitiveSizeInBits()) 2549 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2550 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2551 2552 // Get the total trip count from the count by adding 1. 2553 const SCEV *ExitCount = SE->getAddExpr( 2554 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2555 2556 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2557 2558 // Expand the trip count and place the new instructions in the preheader. 2559 // Notice that the pre-header does not change, only the loop body. 2560 SCEVExpander Exp(*SE, DL, "induction"); 2561 2562 // Count holds the overall loop count (N). 2563 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2564 L->getLoopPreheader()->getTerminator()); 2565 2566 if (TripCount->getType()->isPointerTy()) 2567 TripCount = 2568 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2569 L->getLoopPreheader()->getTerminator()); 2570 2571 return TripCount; 2572 } 2573 2574 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2575 if (VectorTripCount) 2576 return VectorTripCount; 2577 2578 Value *TC = getOrCreateTripCount(L); 2579 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2580 2581 Type *Ty = TC->getType(); 2582 Constant *Step = ConstantInt::get(Ty, VF * UF); 2583 2584 // If the tail is to be folded by masking, round the number of iterations N 2585 // up to a multiple of Step instead of rounding down. This is done by first 2586 // adding Step-1 and then rounding down. Note that it's ok if this addition 2587 // overflows: the vector induction variable will eventually wrap to zero given 2588 // that it starts at zero and its Step is a power of two; the loop will then 2589 // exit, with the last early-exit vector comparison also producing all-true. 2590 if (Cost->foldTailByMasking()) { 2591 assert(isPowerOf2_32(VF * UF) && 2592 "VF*UF must be a power of 2 when folding tail by masking"); 2593 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2594 } 2595 2596 // Now we need to generate the expression for the part of the loop that the 2597 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2598 // iterations are not required for correctness, or N - Step, otherwise. Step 2599 // is equal to the vectorization factor (number of SIMD elements) times the 2600 // unroll factor (number of SIMD instructions). 2601 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2602 2603 // If there is a non-reversed interleaved group that may speculatively access 2604 // memory out-of-bounds, we need to ensure that there will be at least one 2605 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2606 // the trip count, we set the remainder to be equal to the step. If the step 2607 // does not evenly divide the trip count, no adjustment is necessary since 2608 // there will already be scalar iterations. Note that the minimum iterations 2609 // check ensures that N >= Step. 
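  // Worked example (assuming VF = 4 and UF = 2, so Step = 8): for N = 17 the
  // remainder is 1 and n.vec = 16, leaving one scalar iteration; if a scalar
  // epilogue is required and N = 16, the zero remainder is bumped to Step so
  // that n.vec = 8 and the last eight iterations run in the scalar loop; with
  // tail folding, N = 17 is first rounded up to 24 and n.vec = 24.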
2610 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2611 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2612 R = Builder.CreateSelect(IsZero, Step, R); 2613 } 2614 2615 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2616 2617 return VectorTripCount; 2618 } 2619 2620 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2621 const DataLayout &DL) { 2622 // Verify that V is a vector type with same number of elements as DstVTy. 2623 unsigned VF = DstVTy->getNumElements(); 2624 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2625 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2626 Type *SrcElemTy = SrcVecTy->getElementType(); 2627 Type *DstElemTy = DstVTy->getElementType(); 2628 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2629 "Vector elements must have same size"); 2630 2631 // Do a direct cast if element types are castable. 2632 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2633 return Builder.CreateBitOrPointerCast(V, DstVTy); 2634 } 2635 // V cannot be directly casted to desired vector type. 2636 // May happen when V is a floating point vector but DstVTy is a vector of 2637 // pointers or vice-versa. Handle this using a two-step bitcast using an 2638 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2639 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2640 "Only one type should be a pointer type"); 2641 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2642 "Only one type should be a floating point type"); 2643 Type *IntTy = 2644 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2645 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2646 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2647 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2648 } 2649 2650 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2651 BasicBlock *Bypass) { 2652 Value *Count = getOrCreateTripCount(L); 2653 BasicBlock *BB = L->getLoopPreheader(); 2654 IRBuilder<> Builder(BB->getTerminator()); 2655 2656 // Generate code to check if the loop's trip count is less than VF * UF, or 2657 // equal to it in case a scalar epilogue is required; this implies that the 2658 // vector trip count is zero. This check also covers the case where adding one 2659 // to the backedge-taken count overflowed leading to an incorrect trip count 2660 // of zero. In this case we will also jump to the scalar loop. 2661 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2662 : ICmpInst::ICMP_ULT; 2663 2664 // If tail is to be folded, vector loop takes care of all iterations. 2665 Value *CheckMinIters = Builder.getFalse(); 2666 if (!Cost->foldTailByMasking()) 2667 CheckMinIters = Builder.CreateICmp( 2668 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2669 "min.iters.check"); 2670 2671 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2672 // Update dominator tree immediately if the generated block is a 2673 // LoopBypassBlock because SCEV expansions to generate loop bypass 2674 // checks may query it before the current function is finished. 
2675 DT->addNewBlock(NewBB, BB); 2676 if (L->getParentLoop()) 2677 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2678 ReplaceInstWithInst(BB->getTerminator(), 2679 BranchInst::Create(Bypass, NewBB, CheckMinIters)); 2680 LoopBypassBlocks.push_back(BB); 2681 } 2682 2683 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2684 BasicBlock *BB = L->getLoopPreheader(); 2685 2686 // Generate the code to check that the SCEV assumptions that we made. 2687 // We want the new basic block to start at the first instruction in a 2688 // sequence of instructions that form a check. 2689 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2690 "scev.check"); 2691 Value *SCEVCheck = 2692 Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); 2693 2694 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2695 if (C->isZero()) 2696 return; 2697 2698 assert(!BB->getParent()->hasOptSize() && 2699 "Cannot SCEV check stride or overflow when optimizing for size"); 2700 2701 // Create a new block containing the stride check. 2702 BB->setName("vector.scevcheck"); 2703 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2704 // Update dominator tree immediately if the generated block is a 2705 // LoopBypassBlock because SCEV expansions to generate loop bypass 2706 // checks may query it before the current function is finished. 2707 DT->addNewBlock(NewBB, BB); 2708 if (L->getParentLoop()) 2709 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2710 ReplaceInstWithInst(BB->getTerminator(), 2711 BranchInst::Create(Bypass, NewBB, SCEVCheck)); 2712 LoopBypassBlocks.push_back(BB); 2713 AddedSafetyChecks = true; 2714 } 2715 2716 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2717 // VPlan-native path does not do any analysis for runtime checks currently. 2718 if (EnableVPlanNativePath) 2719 return; 2720 2721 BasicBlock *BB = L->getLoopPreheader(); 2722 2723 // Generate the code that checks in runtime if arrays overlap. We put the 2724 // checks into a separate block to make the more common case of few elements 2725 // faster. 2726 Instruction *FirstCheckInst; 2727 Instruction *MemRuntimeCheck; 2728 std::tie(FirstCheckInst, MemRuntimeCheck) = 2729 Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); 2730 if (!MemRuntimeCheck) 2731 return; 2732 2733 assert(!BB->getParent()->hasOptSize() && 2734 "Cannot emit memory checks when optimizing for size"); 2735 2736 // Create a new block containing the memory check. 2737 BB->setName("vector.memcheck"); 2738 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2739 // Update dominator tree immediately if the generated block is a 2740 // LoopBypassBlock because SCEV expansions to generate loop bypass 2741 // checks may query it before the current function is finished. 2742 DT->addNewBlock(NewBB, BB); 2743 if (L->getParentLoop()) 2744 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2745 ReplaceInstWithInst(BB->getTerminator(), 2746 BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); 2747 LoopBypassBlocks.push_back(BB); 2748 AddedSafetyChecks = true; 2749 2750 // We currently don't use LoopVersioning for the actual loop cloning but we 2751 // still use it to add the noalias metadata. 
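// (As a rough sketch of intent rather than a guaranteed interface: the
// noalias/scope metadata lets later passes assume that pointers from
// different runtime-checked groups do not alias inside the vector loop.)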
2752 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2753 PSE.getSE()); 2754 LVer->prepareNoAliasMetadata(); 2755 } 2756 2757 Value *InnerLoopVectorizer::emitTransformedIndex( 2758 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2759 const InductionDescriptor &ID) const { 2760 2761 SCEVExpander Exp(*SE, DL, "induction"); 2762 auto Step = ID.getStep(); 2763 auto StartValue = ID.getStartValue(); 2764 assert(Index->getType() == Step->getType() && 2765 "Index type does not match StepValue type"); 2766 2767 // Note: the IR at this point is broken. We cannot use SE to create any new 2768 // SCEV and then expand it, hoping that SCEV's simplification will give us 2769 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2770 // lead to various SCEV crashes. So all we can do is to use builder and rely 2771 // on InstCombine for future simplifications. Here we handle some trivial 2772 // cases only. 2773 auto CreateAdd = [&B](Value *X, Value *Y) { 2774 assert(X->getType() == Y->getType() && "Types don't match!"); 2775 if (auto *CX = dyn_cast<ConstantInt>(X)) 2776 if (CX->isZero()) 2777 return Y; 2778 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2779 if (CY->isZero()) 2780 return X; 2781 return B.CreateAdd(X, Y); 2782 }; 2783 2784 auto CreateMul = [&B](Value *X, Value *Y) { 2785 assert(X->getType() == Y->getType() && "Types don't match!"); 2786 if (auto *CX = dyn_cast<ConstantInt>(X)) 2787 if (CX->isOne()) 2788 return Y; 2789 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2790 if (CY->isOne()) 2791 return X; 2792 return B.CreateMul(X, Y); 2793 }; 2794 2795 switch (ID.getKind()) { 2796 case InductionDescriptor::IK_IntInduction: { 2797 assert(Index->getType() == StartValue->getType() && 2798 "Index type does not match StartValue type"); 2799 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2800 return B.CreateSub(StartValue, Index); 2801 auto *Offset = CreateMul( 2802 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2803 return CreateAdd(StartValue, Offset); 2804 } 2805 case InductionDescriptor::IK_PtrInduction: { 2806 assert(isa<SCEVConstant>(Step) && 2807 "Expected constant step for pointer induction"); 2808 return B.CreateGEP( 2809 StartValue->getType()->getPointerElementType(), StartValue, 2810 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2811 &*B.GetInsertPoint()))); 2812 } 2813 case InductionDescriptor::IK_FpInduction: { 2814 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2815 auto InductionBinOp = ID.getInductionBinOp(); 2816 assert(InductionBinOp && 2817 (InductionBinOp->getOpcode() == Instruction::FAdd || 2818 InductionBinOp->getOpcode() == Instruction::FSub) && 2819 "Original bin op should be defined for FP induction"); 2820 2821 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2822 2823 // Floating point operations had to be 'fast' to enable the induction. 2824 FastMathFlags Flags; 2825 Flags.setFast(); 2826 2827 Value *MulExp = B.CreateFMul(StepValue, Index); 2828 if (isa<Instruction>(MulExp)) 2829 // We have to check, the MulExp may be a constant. 
2830 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2831 2832 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2833 "induction"); 2834 if (isa<Instruction>(BOp)) 2835 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2836 2837 return BOp; 2838 } 2839 case InductionDescriptor::IK_NoInduction: 2840 return nullptr; 2841 } 2842 llvm_unreachable("invalid enum"); 2843 } 2844 2845 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2846 /* 2847 In this function we generate a new loop. The new loop will contain 2848 the vectorized instructions while the old loop will continue to run the 2849 scalar remainder. 2850 2851 [ ] <-- loop iteration number check. 2852 / | 2853 / v 2854 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2855 | / | 2856 | / v 2857 || [ ] <-- vector pre header. 2858 |/ | 2859 | v 2860 | [ ] \ 2861 | [ ]_| <-- vector loop. 2862 | | 2863 | v 2864 | -[ ] <--- middle-block. 2865 | / | 2866 | / v 2867 -|- >[ ] <--- new preheader. 2868 | | 2869 | v 2870 | [ ] \ 2871 | [ ]_| <-- old scalar loop to handle remainder. 2872 \ | 2873 \ v 2874 >[ ] <-- exit block. 2875 ... 2876 */ 2877 2878 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 2879 BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); 2880 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 2881 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2882 assert(VectorPH && "Invalid loop structure"); 2883 assert(ExitBlock && "Must have an exit block"); 2884 2885 // Some loops have a single integer induction variable, while other loops 2886 // don't. One example is c++ iterators that often have multiple pointer 2887 // induction variables. In the code below we also support a case where we 2888 // don't have a single induction variable. 2889 // 2890 // We try to obtain an induction variable from the original loop as hard 2891 // as possible. However if we don't find one that: 2892 // - is an integer 2893 // - counts from zero, stepping by one 2894 // - is the size of the widest induction variable type 2895 // then we create a new one. 2896 OldInduction = Legal->getPrimaryInduction(); 2897 Type *IdxTy = Legal->getWidestInductionType(); 2898 2899 // Split the single block loop into the two loop structure described above. 2900 BasicBlock *VecBody = 2901 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 2902 BasicBlock *MiddleBlock = 2903 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 2904 BasicBlock *ScalarPH = 2905 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 2906 2907 // Create and register the new vector loop. 2908 Loop *Lp = LI->AllocateLoop(); 2909 Loop *ParentLoop = OrigLoop->getParentLoop(); 2910 2911 // Insert the new loop into the loop nest and register the new basic blocks 2912 // before calling any utilities such as SCEV that require valid LoopInfo. 2913 if (ParentLoop) { 2914 ParentLoop->addChildLoop(Lp); 2915 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); 2916 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); 2917 } else { 2918 LI->addTopLevelLoop(Lp); 2919 } 2920 Lp->addBasicBlockToLoop(VecBody, *LI); 2921 2922 // Find the loop boundaries. 2923 Value *Count = getOrCreateTripCount(Lp); 2924 2925 Value *StartIdx = ConstantInt::get(IdxTy, 0); 2926 2927 // Now, compare the new count to zero. If it is zero skip the vector loop and 2928 // jump to the scalar loop. 
This check also covers the case where the 2929 // backedge-taken count is uint##_max: adding one to it will overflow leading 2930 // to an incorrect trip count of zero. In this (rare) case we will also jump 2931 // to the scalar loop. 2932 emitMinimumIterationCountCheck(Lp, ScalarPH); 2933 2934 // Generate the code to check any assumptions that we've made for SCEV 2935 // expressions. 2936 emitSCEVChecks(Lp, ScalarPH); 2937 2938 // Generate the code that checks in runtime if arrays overlap. We put the 2939 // checks into a separate block to make the more common case of few elements 2940 // faster. 2941 emitMemRuntimeChecks(Lp, ScalarPH); 2942 2943 // Generate the induction variable. 2944 // The loop step is equal to the vectorization factor (num of SIMD elements) 2945 // times the unroll factor (num of SIMD instructions). 2946 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 2947 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 2948 Induction = 2949 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 2950 getDebugLocFromInstOrOperands(OldInduction)); 2951 2952 // We are going to resume the execution of the scalar loop. 2953 // Go over all of the induction variables that we found and fix the 2954 // PHIs that are left in the scalar version of the loop. 2955 // The starting values of PHI nodes depend on the counter of the last 2956 // iteration in the vectorized loop. 2957 // If we come from a bypass edge then we need to start from the original 2958 // start value. 2959 2960 // This variable saves the new starting index for the scalar loop. It is used 2961 // to test if there are any tail iterations left once the vector loop has 2962 // completed. 2963 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 2964 for (auto &InductionEntry : *List) { 2965 PHINode *OrigPhi = InductionEntry.first; 2966 InductionDescriptor II = InductionEntry.second; 2967 2968 // Create phi nodes to merge from the backedge-taken check block. 2969 PHINode *BCResumeVal = PHINode::Create( 2970 OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); 2971 // Copy original phi DL over to the new one. 2972 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 2973 Value *&EndValue = IVEndValues[OrigPhi]; 2974 if (OrigPhi == OldInduction) { 2975 // We know what the end value is. 2976 EndValue = CountRoundDown; 2977 } else { 2978 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 2979 Type *StepType = II.getStep()->getType(); 2980 Instruction::CastOps CastOp = 2981 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 2982 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 2983 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2984 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 2985 EndValue->setName("ind.end"); 2986 } 2987 2988 // The new PHI merges the original incoming value, in case of a bypass, 2989 // or the value at the end of the vectorized loop. 2990 BCResumeVal->addIncoming(EndValue, MiddleBlock); 2991 2992 // Fix the scalar body counter (PHI node). 2993 // The old induction's phi node in the scalar body needs the truncated 2994 // value. 2995 for (BasicBlock *BB : LoopBypassBlocks) 2996 BCResumeVal->addIncoming(II.getStartValue(), BB); 2997 OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); 2998 } 2999 3000 // We need the OrigLoop (scalar loop part) latch terminator to help 3001 // produce correct debug info for the middle block BB instructions. 
3002 // The legality check stage guarantees that the loop will have a single 3003 // latch. 3004 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3005 "Scalar loop latch terminator isn't a branch"); 3006 BranchInst *ScalarLatchBr = 3007 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3008 3009 // Add a check in the middle block to see if we have completed 3010 // all of the iterations in the first vector loop. 3011 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3012 // If tail is to be folded, we know we don't need to run the remainder. 3013 Value *CmpN = Builder.getTrue(); 3014 if (!Cost->foldTailByMasking()) { 3015 CmpN = 3016 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3017 CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); 3018 3019 // Here we use the same DebugLoc as the scalar loop latch branch instead 3020 // of the corresponding compare because they may have ended up with 3021 // different line numbers and we want to avoid awkward line stepping while 3022 // debugging. Eg. if the compare has got a line number inside the loop. 3023 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3024 } 3025 3026 BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); 3027 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3028 ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); 3029 3030 // Get ready to start creating new instructions into the vectorized body. 3031 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); 3032 3033 // Save the state. 3034 LoopVectorPreHeader = Lp->getLoopPreheader(); 3035 LoopScalarPreHeader = ScalarPH; 3036 LoopMiddleBlock = MiddleBlock; 3037 LoopExitBlock = ExitBlock; 3038 LoopVectorBody = VecBody; 3039 LoopScalarBody = OldBasicBlock; 3040 3041 Optional<MDNode *> VectorizedLoopID = 3042 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3043 LLVMLoopVectorizeFollowupVectorized}); 3044 if (VectorizedLoopID.hasValue()) { 3045 Lp->setLoopID(VectorizedLoopID.getValue()); 3046 3047 // Do not setAlreadyVectorized if loop attributes have been defined 3048 // explicitly. 3049 return LoopVectorPreHeader; 3050 } 3051 3052 // Keep all loop hints from the original loop on the vector loop (we'll 3053 // replace the vectorizer-specific hints below). 3054 if (MDNode *LID = OrigLoop->getLoopID()) 3055 Lp->setLoopID(LID); 3056 3057 LoopVectorizeHints Hints(Lp, true, *ORE); 3058 Hints.setAlreadyVectorized(); 3059 3060 return LoopVectorPreHeader; 3061 } 3062 3063 // Fix up external users of the induction variable. At this point, we are 3064 // in LCSSA form, with all external PHIs that use the IV having one input value, 3065 // coming from the remainder loop. We need those PHIs to also have a correct 3066 // value for the IV when arriving directly from the middle block. 3067 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3068 const InductionDescriptor &II, 3069 Value *CountRoundDown, Value *EndValue, 3070 BasicBlock *MiddleBlock) { 3071 // There are two kinds of external IV usages - those that use the value 3072 // computed in the last iteration (the PHI) and those that use the penultimate 3073 // value (the value that feeds into the phi from the loop latch). 3074 // We allow both, but they, obviously, have different values. 
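// As an illustrative sketch, for a canonical IV in
//   for (i = 0; i < n; ++i) { ... }
// an external use of the latch-incremented value must see n (EndValue),
// whereas an external use of the header phi itself must see the value of the
// final iteration, i.e. EndValue - Step. Both cases are patched below for the
// path that arrives directly from the middle block.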
3075 3076 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3077 3078 DenseMap<Value *, Value *> MissingVals; 3079 3080 // An external user of the last iteration's value should see the value that 3081 // the remainder loop uses to initialize its own IV. 3082 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3083 for (User *U : PostInc->users()) { 3084 Instruction *UI = cast<Instruction>(U); 3085 if (!OrigLoop->contains(UI)) { 3086 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3087 MissingVals[UI] = EndValue; 3088 } 3089 } 3090 3091 // An external user of the penultimate value need to see EndValue - Step. 3092 // The simplest way to get this is to recompute it from the constituent SCEVs, 3093 // that is Start + (Step * (CRD - 1)). 3094 for (User *U : OrigPhi->users()) { 3095 auto *UI = cast<Instruction>(U); 3096 if (!OrigLoop->contains(UI)) { 3097 const DataLayout &DL = 3098 OrigLoop->getHeader()->getModule()->getDataLayout(); 3099 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3100 3101 IRBuilder<> B(MiddleBlock->getTerminator()); 3102 Value *CountMinusOne = B.CreateSub( 3103 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3104 Value *CMO = 3105 !II.getStep()->getType()->isIntegerTy() 3106 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3107 II.getStep()->getType()) 3108 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3109 CMO->setName("cast.cmo"); 3110 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3111 Escape->setName("ind.escape"); 3112 MissingVals[UI] = Escape; 3113 } 3114 } 3115 3116 for (auto &I : MissingVals) { 3117 PHINode *PHI = cast<PHINode>(I.first); 3118 // One corner case we have to handle is two IVs "chasing" each-other, 3119 // that is %IV2 = phi [...], [ %IV1, %latch ] 3120 // In this case, if IV1 has an external use, we need to avoid adding both 3121 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3122 // don't already have an incoming value for the middle block. 3123 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3124 PHI->addIncoming(I.second, MiddleBlock); 3125 } 3126 } 3127 3128 namespace { 3129 3130 struct CSEDenseMapInfo { 3131 static bool canHandle(const Instruction *I) { 3132 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3133 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3134 } 3135 3136 static inline Instruction *getEmptyKey() { 3137 return DenseMapInfo<Instruction *>::getEmptyKey(); 3138 } 3139 3140 static inline Instruction *getTombstoneKey() { 3141 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3142 } 3143 3144 static unsigned getHashValue(const Instruction *I) { 3145 assert(canHandle(I) && "Unknown instruction!"); 3146 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3147 I->value_op_end())); 3148 } 3149 3150 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3151 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3152 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3153 return LHS == RHS; 3154 return LHS->isIdenticalTo(RHS); 3155 } 3156 }; 3157 3158 } // end anonymous namespace 3159 3160 ///Perform cse of induction variable instructions. 3161 static void cse(BasicBlock *BB) { 3162 // Perform simple cse. 
3163 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3164 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3165 Instruction *In = &*I++; 3166 3167 if (!CSEDenseMapInfo::canHandle(In)) 3168 continue; 3169 3170 // Check if we can replace this instruction with any of the 3171 // visited instructions. 3172 if (Instruction *V = CSEMap.lookup(In)) { 3173 In->replaceAllUsesWith(V); 3174 In->eraseFromParent(); 3175 continue; 3176 } 3177 3178 CSEMap[In] = In; 3179 } 3180 } 3181 3182 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3183 unsigned VF, 3184 bool &NeedToScalarize) { 3185 Function *F = CI->getCalledFunction(); 3186 StringRef FnName = CI->getCalledFunction()->getName(); 3187 Type *ScalarRetTy = CI->getType(); 3188 SmallVector<Type *, 4> Tys, ScalarTys; 3189 for (auto &ArgOp : CI->arg_operands()) 3190 ScalarTys.push_back(ArgOp->getType()); 3191 3192 // Estimate cost of scalarized vector call. The source operands are assumed 3193 // to be vectors, so we need to extract individual elements from there, 3194 // execute VF scalar calls, and then gather the result into the vector return 3195 // value. 3196 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3197 if (VF == 1) 3198 return ScalarCallCost; 3199 3200 // Compute corresponding vector type for return value and arguments. 3201 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3202 for (Type *ScalarTy : ScalarTys) 3203 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3204 3205 // Compute costs of unpacking argument values for the scalar calls and 3206 // packing the return values to a vector. 3207 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3208 3209 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3210 3211 // If we can't emit a vector call for this function, then the currently found 3212 // cost is the cost we need to return. 3213 NeedToScalarize = true; 3214 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) 3215 return Cost; 3216 3217 // If the corresponding vector cost is cheaper, return its cost. 3218 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3219 if (VectorCallCost < Cost) { 3220 NeedToScalarize = false; 3221 return VectorCallCost; 3222 } 3223 return Cost; 3224 } 3225 3226 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3227 unsigned VF) { 3228 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3229 assert(ID && "Expected intrinsic call!"); 3230 3231 FastMathFlags FMF; 3232 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3233 FMF = FPMO->getFastMathFlags(); 3234 3235 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3236 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); 3237 } 3238 3239 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3240 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3241 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3242 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3243 } 3244 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3245 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3246 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3247 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3248 } 3249 3250 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3251 // For every instruction `I` in MinBWs, truncate the operands, create a 3252 // truncated version of `I` and reextend its result. 
InstCombine runs 3253 // later and will remove any ext/trunc pairs. 3254 SmallPtrSet<Value *, 4> Erased; 3255 for (const auto &KV : Cost->getMinimalBitwidths()) { 3256 // If the value wasn't vectorized, we must maintain the original scalar 3257 // type. The absence of the value from VectorLoopValueMap indicates that it 3258 // wasn't vectorized. 3259 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3260 continue; 3261 for (unsigned Part = 0; Part < UF; ++Part) { 3262 Value *I = getOrCreateVectorValue(KV.first, Part); 3263 if (Erased.find(I) != Erased.end() || I->use_empty() || 3264 !isa<Instruction>(I)) 3265 continue; 3266 Type *OriginalTy = I->getType(); 3267 Type *ScalarTruncatedTy = 3268 IntegerType::get(OriginalTy->getContext(), KV.second); 3269 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3270 OriginalTy->getVectorNumElements()); 3271 if (TruncatedTy == OriginalTy) 3272 continue; 3273 3274 IRBuilder<> B(cast<Instruction>(I)); 3275 auto ShrinkOperand = [&](Value *V) -> Value * { 3276 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3277 if (ZI->getSrcTy() == TruncatedTy) 3278 return ZI->getOperand(0); 3279 return B.CreateZExtOrTrunc(V, TruncatedTy); 3280 }; 3281 3282 // The actual instruction modification depends on the instruction type, 3283 // unfortunately. 3284 Value *NewI = nullptr; 3285 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3286 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3287 ShrinkOperand(BO->getOperand(1))); 3288 3289 // Any wrapping introduced by shrinking this operation shouldn't be 3290 // considered undefined behavior. So, we can't unconditionally copy 3291 // arithmetic wrapping flags to NewI. 3292 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3293 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3294 NewI = 3295 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3296 ShrinkOperand(CI->getOperand(1))); 3297 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3298 NewI = B.CreateSelect(SI->getCondition(), 3299 ShrinkOperand(SI->getTrueValue()), 3300 ShrinkOperand(SI->getFalseValue())); 3301 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3302 switch (CI->getOpcode()) { 3303 default: 3304 llvm_unreachable("Unhandled cast!"); 3305 case Instruction::Trunc: 3306 NewI = ShrinkOperand(CI->getOperand(0)); 3307 break; 3308 case Instruction::SExt: 3309 NewI = B.CreateSExtOrTrunc( 3310 CI->getOperand(0), 3311 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3312 break; 3313 case Instruction::ZExt: 3314 NewI = B.CreateZExtOrTrunc( 3315 CI->getOperand(0), 3316 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3317 break; 3318 } 3319 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3320 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3321 auto *O0 = B.CreateZExtOrTrunc( 3322 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3323 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3324 auto *O1 = B.CreateZExtOrTrunc( 3325 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3326 3327 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3328 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3329 // Don't do anything with the operands, just extend the result. 
3330 continue; 3331 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3332 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3333 auto *O0 = B.CreateZExtOrTrunc( 3334 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3335 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3336 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3337 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3338 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3339 auto *O0 = B.CreateZExtOrTrunc( 3340 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3341 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3342 } else { 3343 // If we don't know what to do, be conservative and don't do anything. 3344 continue; 3345 } 3346 3347 // Lastly, extend the result. 3348 NewI->takeName(cast<Instruction>(I)); 3349 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3350 I->replaceAllUsesWith(Res); 3351 cast<Instruction>(I)->eraseFromParent(); 3352 Erased.insert(I); 3353 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3354 } 3355 } 3356 3357 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3358 for (const auto &KV : Cost->getMinimalBitwidths()) { 3359 // If the value wasn't vectorized, we must maintain the original scalar 3360 // type. The absence of the value from VectorLoopValueMap indicates that it 3361 // wasn't vectorized. 3362 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3363 continue; 3364 for (unsigned Part = 0; Part < UF; ++Part) { 3365 Value *I = getOrCreateVectorValue(KV.first, Part); 3366 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3367 if (Inst && Inst->use_empty()) { 3368 Value *NewI = Inst->getOperand(0); 3369 Inst->eraseFromParent(); 3370 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3371 } 3372 } 3373 } 3374 } 3375 3376 void InnerLoopVectorizer::fixVectorizedLoop() { 3377 // Insert truncates and extends for any truncated instructions as hints to 3378 // InstCombine. 3379 if (VF > 1) 3380 truncateToMinimalBitwidths(); 3381 3382 // Fix widened non-induction PHIs by setting up the PHI operands. 3383 if (OrigPHIsToFix.size()) { 3384 assert(EnableVPlanNativePath && 3385 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3386 fixNonInductionPHIs(); 3387 } 3388 3389 // At this point every instruction in the original loop is widened to a 3390 // vector form. Now we need to fix the recurrences in the loop. These PHI 3391 // nodes are currently empty because we did not want to introduce cycles. 3392 // This is the second stage of vectorizing recurrences. 3393 fixCrossIterationPHIs(); 3394 3395 // Update the dominator tree. 3396 // 3397 // FIXME: After creating the structure of the new loop, the dominator tree is 3398 // no longer up-to-date, and it remains that way until we update it 3399 // here. An out-of-date dominator tree is problematic for SCEV, 3400 // because SCEVExpander uses it to guide code generation. The 3401 // vectorizer use SCEVExpanders in several places. Instead, we should 3402 // keep the dominator tree up-to-date as we go. 3403 updateAnalysis(); 3404 3405 // Fix-up external users of the induction variables. 
3406 for (auto &Entry : *Legal->getInductionVars()) 3407 fixupIVUsers(Entry.first, Entry.second, 3408 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3409 IVEndValues[Entry.first], LoopMiddleBlock); 3410 3411 fixLCSSAPHIs(); 3412 for (Instruction *PI : PredicatedInstructions) 3413 sinkScalarOperands(&*PI); 3414 3415 // Remove redundant induction instructions. 3416 cse(LoopVectorBody); 3417 } 3418 3419 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3420 // In order to support recurrences we need to be able to vectorize Phi nodes. 3421 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3422 // stage #2: We now need to fix the recurrences by adding incoming edges to 3423 // the currently empty PHI nodes. At this point every instruction in the 3424 // original loop is widened to a vector form so we can use them to construct 3425 // the incoming edges. 3426 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3427 // Handle first-order recurrences and reductions that need to be fixed. 3428 if (Legal->isFirstOrderRecurrence(&Phi)) 3429 fixFirstOrderRecurrence(&Phi); 3430 else if (Legal->isReductionVariable(&Phi)) 3431 fixReduction(&Phi); 3432 } 3433 } 3434 3435 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3436 // This is the second phase of vectorizing first-order recurrences. An 3437 // overview of the transformation is described below. Suppose we have the 3438 // following loop. 3439 // 3440 // for (int i = 0; i < n; ++i) 3441 // b[i] = a[i] - a[i - 1]; 3442 // 3443 // There is a first-order recurrence on "a". For this loop, the shorthand 3444 // scalar IR looks like: 3445 // 3446 // scalar.ph: 3447 // s_init = a[-1] 3448 // br scalar.body 3449 // 3450 // scalar.body: 3451 // i = phi [0, scalar.ph], [i+1, scalar.body] 3452 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3453 // s2 = a[i] 3454 // b[i] = s2 - s1 3455 // br cond, scalar.body, ... 3456 // 3457 // In this example, s1 is a recurrence because it's value depends on the 3458 // previous iteration. In the first phase of vectorization, we created a 3459 // temporary value for s1. We now complete the vectorization and produce the 3460 // shorthand vector IR shown below (for VF = 4, UF = 1). 3461 // 3462 // vector.ph: 3463 // v_init = vector(..., ..., ..., a[-1]) 3464 // br vector.body 3465 // 3466 // vector.body 3467 // i = phi [0, vector.ph], [i+4, vector.body] 3468 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3469 // v2 = a[i, i+1, i+2, i+3]; 3470 // v3 = vector(v1(3), v2(0, 1, 2)) 3471 // b[i, i+1, i+2, i+3] = v2 - v3 3472 // br cond, vector.body, middle.block 3473 // 3474 // middle.block: 3475 // x = v2(3) 3476 // br scalar.ph 3477 // 3478 // scalar.ph: 3479 // s_init = phi [x, middle.block], [a[-1], otherwise] 3480 // br scalar.body 3481 // 3482 // After execution completes the vector loop, we extract the next value of 3483 // the recurrence (x) to use as the initial value in the scalar loop. 3484 3485 // Get the original loop preheader and single loop latch. 3486 auto *Preheader = OrigLoop->getLoopPreheader(); 3487 auto *Latch = OrigLoop->getLoopLatch(); 3488 3489 // Get the initial and previous values of the scalar recurrence. 3490 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3491 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3492 3493 // Create a vector from the initial value. 
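// For VF = 4 this is roughly (a sketch; the element type depends on the
// recurrence):
//   %vector.recur.init = insertelement <4 x i32> undef, i32 %s_init, i32 3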
3494 auto *VectorInit = ScalarInit; 3495 if (VF > 1) { 3496 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3497 VectorInit = Builder.CreateInsertElement( 3498 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3499 Builder.getInt32(VF - 1), "vector.recur.init"); 3500 } 3501 3502 // We constructed a temporary phi node in the first phase of vectorization. 3503 // This phi node will eventually be deleted. 3504 Builder.SetInsertPoint( 3505 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3506 3507 // Create a phi node for the new recurrence. The current value will either be 3508 // the initial value inserted into a vector or loop-varying vector value. 3509 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3510 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3511 3512 // Get the vectorized previous value of the last part UF - 1. It appears last 3513 // among all unrolled iterations, due to the order of their construction. 3514 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3515 3516 // Set the insertion point after the previous value if it is an instruction. 3517 // Note that the previous value may have been constant-folded so it is not 3518 // guaranteed to be an instruction in the vector loop. Also, if the previous 3519 // value is a phi node, we should insert after all the phi nodes to avoid 3520 // breaking basic block verification. 3521 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || 3522 isa<PHINode>(PreviousLastPart)) 3523 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3524 else 3525 Builder.SetInsertPoint( 3526 &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); 3527 3528 // We will construct a vector for the recurrence by combining the values for 3529 // the current and previous iterations. This is the required shuffle mask. 3530 SmallVector<Constant *, 8> ShuffleMask(VF); 3531 ShuffleMask[0] = Builder.getInt32(VF - 1); 3532 for (unsigned I = 1; I < VF; ++I) 3533 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3534 3535 // The vector from which to take the initial value for the current iteration 3536 // (actual or unrolled). Initially, this is the vector phi node. 3537 Value *Incoming = VecPhi; 3538 3539 // Shuffle the current and previous vector and update the vector parts. 3540 for (unsigned Part = 0; Part < UF; ++Part) { 3541 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3542 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3543 auto *Shuffle = 3544 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3545 ConstantVector::get(ShuffleMask)) 3546 : Incoming; 3547 PhiPart->replaceAllUsesWith(Shuffle); 3548 cast<Instruction>(PhiPart)->eraseFromParent(); 3549 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3550 Incoming = PreviousPart; 3551 } 3552 3553 // Fix the latch value of the new recurrence in the vector loop. 3554 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3555 3556 // Extract the last vector element in the middle block. This will be the 3557 // initial value for the recurrence when jumping to the scalar loop. 
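// For VF = 4 this is roughly (again a sketch, assuming an i32 recurrence):
//   %vector.recur.extract = extractelement <4 x i32> %incoming, i32 3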
3558 auto *ExtractForScalar = Incoming; 3559 if (VF > 1) { 3560 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3561 ExtractForScalar = Builder.CreateExtractElement( 3562 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3563 } 3564 // Extract the second last element in the middle block if the 3565 // Phi is used outside the loop. We need to extract the phi itself 3566 // and not the last element (the phi update in the current iteration). This 3567 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3568 // when the scalar loop is not run at all. 3569 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3570 if (VF > 1) 3571 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3572 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3573 // When loop is unrolled without vectorizing, initialize 3574 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3575 // `Incoming`. This is analogous to the vectorized case above: extracting the 3576 // second last element when VF > 1. 3577 else if (UF > 1) 3578 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3579 3580 // Fix the initial value of the original recurrence in the scalar loop. 3581 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3582 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3583 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3584 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3585 Start->addIncoming(Incoming, BB); 3586 } 3587 3588 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3589 Phi->setName("scalar.recur"); 3590 3591 // Finally, fix users of the recurrence outside the loop. The users will need 3592 // either the last value of the scalar recurrence or the last value of the 3593 // vector recurrence we extracted in the middle block. Since the loop is in 3594 // LCSSA form, we just need to find all the phi nodes for the original scalar 3595 // recurrence in the exit block, and then add an edge for the middle block. 3596 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3597 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3598 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3599 } 3600 } 3601 } 3602 3603 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3604 Constant *Zero = Builder.getInt32(0); 3605 3606 // Get it's reduction variable descriptor. 3607 assert(Legal->isReductionVariable(Phi) && 3608 "Unable to find the reduction variable"); 3609 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; 3610 3611 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3612 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3613 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3614 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3615 RdxDesc.getMinMaxRecurrenceKind(); 3616 setDebugLocFromInst(Builder, ReductionStartValue); 3617 3618 // We need to generate a reduction vector from the incoming scalar. 3619 // To do so, we need to generate the 'identity' vector and override 3620 // one of the elements with the incoming scalar reduction. We need 3621 // to do it in the vector-loop preheader. 3622 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3623 3624 // This is the vector-clone of the value that leaves the loop. 3625 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3626 3627 // Find the reduction identity variable. 
Zero for addition, or and xor;
3628 // one for multiplication; -1 for and.
3629 Value *Identity;
3630 Value *VectorStart;
3631 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3632 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3633 // MinMax reductions have the start value as their identity.
3634 if (VF == 1) {
3635 VectorStart = Identity = ReductionStartValue;
3636 } else {
3637 VectorStart = Identity =
3638 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3639 }
3640 } else {
3641 // Handle other reduction kinds:
3642 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3643 RK, VecTy->getScalarType());
3644 if (VF == 1) {
3645 Identity = Iden;
3646 // This vector is the Identity vector where the first element is the
3647 // incoming scalar reduction.
3648 VectorStart = ReductionStartValue;
3649 } else {
3650 Identity = ConstantVector::getSplat(VF, Iden);
3651
3652 // This vector is the Identity vector where the first element is the
3653 // incoming scalar reduction.
3654 VectorStart =
3655 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3656 }
3657 }
3658
3659 // Fix the vector-loop phi.
3660
3661 // Reductions do not have to start at zero. They can start with
3662 // any loop invariant values.
3663 BasicBlock *Latch = OrigLoop->getLoopLatch();
3664 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3665 for (unsigned Part = 0; Part < UF; ++Part) {
3666 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3667 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3668 // Make sure to add the reduction start value only to the
3669 // first unroll part.
3670 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3671 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3672 cast<PHINode>(VecRdxPhi)
3673 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3674 }
3675
3676 // Before each round, move the insertion point right between
3677 // the PHIs and the values we are going to write.
3678 // This allows us to write both PHINodes and the extractelement
3679 // instructions.
3680 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3681
3682 setDebugLocFromInst(Builder, LoopExitInst);
3683
3684 // If tail is folded by masking, the vector value to leave the loop should be
3685 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3686 // instead of the former.
3687 if (Cost->foldTailByMasking()) {
3688 for (unsigned Part = 0; Part < UF; ++Part) {
3689 Value *VecLoopExitInst =
3690 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3691 Value *Sel = nullptr;
3692 for (User *U : VecLoopExitInst->users()) {
3693 if (isa<SelectInst>(U)) {
3694 assert(!Sel && "Reduction exit feeding two selects");
3695 Sel = U;
3696 } else
3697 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3698 }
3699 assert(Sel && "Reduction exit feeds no select");
3700 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3701 }
3702 }
3703
3704 // If the vector reduction can be performed in a smaller type, we truncate
3705 // then extend the loop exit value to enable InstCombine to evaluate the
3706 // entire expression in the smaller type.
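// For example (a sketch): an i8 add-reduction that was widened to <4 x i32>
// is truncated to <4 x i8> here and then sign- or zero-extended back, so the
// reduction chain can later be narrowed to operate on i8 elements.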
3707 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3708 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3709 Builder.SetInsertPoint( 3710 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3711 VectorParts RdxParts(UF); 3712 for (unsigned Part = 0; Part < UF; ++Part) { 3713 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3714 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3715 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3716 : Builder.CreateZExt(Trunc, VecTy); 3717 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3718 UI != RdxParts[Part]->user_end();) 3719 if (*UI != Trunc) { 3720 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3721 RdxParts[Part] = Extnd; 3722 } else { 3723 ++UI; 3724 } 3725 } 3726 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3727 for (unsigned Part = 0; Part < UF; ++Part) { 3728 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3729 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3730 } 3731 } 3732 3733 // Reduce all of the unrolled parts into a single vector. 3734 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3735 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3736 3737 // The middle block terminator has already been assigned a DebugLoc here (the 3738 // OrigLoop's single latch terminator). We want the whole middle block to 3739 // appear to execute on this line because: (a) it is all compiler generated, 3740 // (b) these instructions are always executed after evaluating the latch 3741 // conditional branch, and (c) other passes may add new predecessors which 3742 // terminate on this line. This is the easiest way to ensure we don't 3743 // accidentally cause an extra step back into the loop while debugging. 3744 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3745 for (unsigned Part = 1; Part < UF; ++Part) { 3746 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3747 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3748 // Floating point operations had to be 'fast' to enable the reduction. 3749 ReducedPartRdx = addFastMathFlag( 3750 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3751 ReducedPartRdx, "bin.rdx"), 3752 RdxDesc.getFastMathFlags()); 3753 else 3754 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3755 RdxPart); 3756 } 3757 3758 if (VF > 1) { 3759 bool NoNaN = Legal->hasFunNoNaNAttr(); 3760 ReducedPartRdx = 3761 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3762 // If the reduction can be performed in a smaller type, we need to extend 3763 // the reduction to the wider type before we branch to the original loop. 3764 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3765 ReducedPartRdx = 3766 RdxDesc.isSigned() 3767 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3768 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3769 } 3770 3771 // Create a phi node that merges control-flow from the backedge-taken check 3772 // block and the middle block. 
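// The shape of the merge phi is roughly (a sketch):
//   scalar.ph:
//     %bc.merge.rdx = phi [ %start, <bypass blocks> ], [ %rdx, %middle.block ]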
3773 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3774 LoopScalarPreHeader->getTerminator()); 3775 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3776 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3777 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3778 3779 // Now, we need to fix the users of the reduction variable 3780 // inside and outside of the scalar remainder loop. 3781 // We know that the loop is in LCSSA form. We need to update the 3782 // PHI nodes in the exit blocks. 3783 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3784 // All PHINodes need to have a single entry edge, or two if 3785 // we already fixed them. 3786 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3787 3788 // We found a reduction value exit-PHI. Update it with the 3789 // incoming bypass edge. 3790 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3791 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3792 } // end of the LCSSA phi scan. 3793 3794 // Fix the scalar loop reduction variable with the incoming reduction sum 3795 // from the vector body and from the backedge value. 3796 int IncomingEdgeBlockIdx = 3797 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3798 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3799 // Pick the other block. 3800 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3801 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3802 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3803 } 3804 3805 void InnerLoopVectorizer::fixLCSSAPHIs() { 3806 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3807 if (LCSSAPhi.getNumIncomingValues() == 1) { 3808 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3809 // Non-instruction incoming values will have only one value. 3810 unsigned LastLane = 0; 3811 if (isa<Instruction>(IncomingValue)) 3812 LastLane = Cost->isUniformAfterVectorization( 3813 cast<Instruction>(IncomingValue), VF) 3814 ? 0 3815 : VF - 1; 3816 // Can be a loop invariant incoming value or the last scalar value to be 3817 // extracted from the vectorized loop. 3818 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3819 Value *lastIncomingValue = 3820 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 3821 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 3822 } 3823 } 3824 } 3825 3826 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3827 // The basic block and loop containing the predicated instruction. 3828 auto *PredBB = PredInst->getParent(); 3829 auto *VectorLoop = LI->getLoopFor(PredBB); 3830 3831 // Initialize a worklist with the operands of the predicated instruction. 3832 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3833 3834 // Holds instructions that we need to analyze again. An instruction may be 3835 // reanalyzed if we don't yet know if we can sink it or not. 3836 SmallVector<Instruction *, 8> InstsToReanalyze; 3837 3838 // Returns true if a given use occurs in the predicated block. Phi nodes use 3839 // their operands in their corresponding predecessor blocks. 
3840 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3841 auto *I = cast<Instruction>(U.getUser());
3842 BasicBlock *BB = I->getParent();
3843 if (auto *Phi = dyn_cast<PHINode>(I))
3844 BB = Phi->getIncomingBlock(
3845 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3846 return BB == PredBB;
3847 };
3848
3849 // Iteratively sink the scalarized operands of the predicated instruction
3850 // into the block we created for it. When an instruction is sunk, its
3851 // operands are then added to the worklist. The algorithm ends once a full
3852 // pass through the worklist sinks no further instructions.
3853 bool Changed;
3854 do {
3855 // Add the instructions that need to be reanalyzed to the worklist, and
3856 // reset the changed indicator.
3857 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3858 InstsToReanalyze.clear();
3859 Changed = false;
3860
3861 while (!Worklist.empty()) {
3862 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3863
3864 // We can't sink an instruction if it is a phi node, is already in the
3865 // predicated block, is not in the loop, or may have side effects.
3866 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3867 !VectorLoop->contains(I) || I->mayHaveSideEffects())
3868 continue;
3869
3870 // It's legal to sink the instruction if all its uses occur in the
3871 // predicated block. Otherwise, there's nothing to do yet, and we may
3872 // need to reanalyze the instruction.
3873 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3874 InstsToReanalyze.push_back(I);
3875 continue;
3876 }
3877
3878 // Move the instruction to the beginning of the predicated block, and add
3879 // its operands to the worklist.
3880 I->moveBefore(&*PredBB->getFirstInsertionPt());
3881 Worklist.insert(I->op_begin(), I->op_end());
3882
3883 // The sinking may have enabled other instructions to be sunk, so we will
3884 // need to iterate.
3885 Changed = true;
3886 }
3887 } while (Changed);
3888 }
3889
3890 void InnerLoopVectorizer::fixNonInductionPHIs() {
3891 for (PHINode *OrigPhi : OrigPHIsToFix) {
3892 PHINode *NewPhi =
3893 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3894 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3895
3896 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3897 predecessors(OrigPhi->getParent()));
3898 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3899 predecessors(NewPhi->getParent()));
3900 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3901 "Scalar and Vector BB should have the same number of predecessors");
3902
3903 // The insertion point in Builder may be invalidated by the time we get
3904 // here. Force the Builder insertion point to something valid so that we do
3905 // not run into issues during insertion point restore in
3906 // getOrCreateVectorValue calls below.
3907 Builder.SetInsertPoint(NewPhi);
3908
3909 // The predecessor order is preserved and we can rely on mapping between
3910 // scalar and vector block predecessors.
3911 for (unsigned i = 0; i < NumIncomingValues; ++i) {
3912 BasicBlock *NewPredBB = VectorBBPredecessors[i];
3913
3914 // When looking up the new scalar/vector values to fix up, use incoming
3915 // values from the original phi.
3916 Value *ScIncV = 3917 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 3918 3919 // Scalar incoming value may need a broadcast 3920 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 3921 NewPhi->addIncoming(NewIncV, NewPredBB); 3922 } 3923 } 3924 } 3925 3926 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 3927 unsigned VF) { 3928 PHINode *P = cast<PHINode>(PN); 3929 if (EnableVPlanNativePath) { 3930 // Currently we enter here in the VPlan-native path for non-induction 3931 // PHIs where all control flow is uniform. We simply widen these PHIs. 3932 // Create a vector phi with no operands - the vector phi operands will be 3933 // set at the end of vector code generation. 3934 Type *VecTy = 3935 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 3936 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 3937 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 3938 OrigPHIsToFix.push_back(P); 3939 3940 return; 3941 } 3942 3943 assert(PN->getParent() == OrigLoop->getHeader() && 3944 "Non-header phis should have been handled elsewhere"); 3945 3946 // In order to support recurrences we need to be able to vectorize Phi nodes. 3947 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3948 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 3949 // this value when we vectorize all of the instructions that use the PHI. 3950 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 3951 for (unsigned Part = 0; Part < UF; ++Part) { 3952 // This is phase one of vectorizing PHIs. 3953 Type *VecTy = 3954 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 3955 Value *EntryPart = PHINode::Create( 3956 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 3957 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 3958 } 3959 return; 3960 } 3961 3962 setDebugLocFromInst(Builder, P); 3963 3964 // This PHINode must be an induction variable. 3965 // Make sure that we know about it. 3966 assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); 3967 3968 InductionDescriptor II = Legal->getInductionVars()->lookup(P); 3969 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 3970 3971 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 3972 // which can be found from the original scalar operations. 3973 switch (II.getKind()) { 3974 case InductionDescriptor::IK_NoInduction: 3975 llvm_unreachable("Unknown induction"); 3976 case InductionDescriptor::IK_IntInduction: 3977 case InductionDescriptor::IK_FpInduction: 3978 llvm_unreachable("Integer/fp induction is handled elsewhere."); 3979 case InductionDescriptor::IK_PtrInduction: { 3980 // Handle the pointer induction variable case. 3981 assert(P->getType()->isPointerTy() && "Unexpected type."); 3982 // This is the normalized GEP that starts counting at zero. 3983 Value *PtrInd = Induction; 3984 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 3985 // Determine the number of scalars we need to generate for each unroll 3986 // iteration. If the instruction is uniform, we only need to generate the 3987 // first lane. Otherwise, we generate all VF values. 3988 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 3989 // These are the scalar results. Notice that we don't generate vector GEPs 3990 // because scalar GEPs result in better code. 
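// For example (a sketch), with VF = 4 and UF = 2, Part 1 / Lane 2 computes
// the normalized index PtrInd + 6 and emits a scalar GEP (named "next.gep")
// from the induction's start pointer using that index scaled by the step.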
3991 for (unsigned Part = 0; Part < UF; ++Part) { 3992 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 3993 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 3994 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 3995 Value *SclrGep = 3996 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 3997 SclrGep->setName("next.gep"); 3998 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 3999 } 4000 } 4001 return; 4002 } 4003 } 4004 } 4005 4006 /// A helper function for checking whether an integer division-related 4007 /// instruction may divide by zero (in which case it must be predicated if 4008 /// executed conditionally in the scalar code). 4009 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4010 /// Non-zero divisors that are non compile-time constants will not be 4011 /// converted into multiplication, so we will still end up scalarizing 4012 /// the division, but can do so w/o predication. 4013 static bool mayDivideByZero(Instruction &I) { 4014 assert((I.getOpcode() == Instruction::UDiv || 4015 I.getOpcode() == Instruction::SDiv || 4016 I.getOpcode() == Instruction::URem || 4017 I.getOpcode() == Instruction::SRem) && 4018 "Unexpected instruction"); 4019 Value *Divisor = I.getOperand(1); 4020 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4021 return !CInt || CInt->isZero(); 4022 } 4023 4024 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4025 switch (I.getOpcode()) { 4026 case Instruction::Br: 4027 case Instruction::PHI: 4028 llvm_unreachable("This instruction is handled by a different recipe."); 4029 case Instruction::GetElementPtr: { 4030 // Construct a vector GEP by widening the operands of the scalar GEP as 4031 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4032 // results in a vector of pointers when at least one operand of the GEP 4033 // is vector-typed. Thus, to keep the representation compact, we only use 4034 // vector-typed operands for loop-varying values. 4035 auto *GEP = cast<GetElementPtrInst>(&I); 4036 4037 if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { 4038 // If we are vectorizing, but the GEP has only loop-invariant operands, 4039 // the GEP we build (by only using vector-typed operands for 4040 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4041 // produce a vector of pointers, we need to either arbitrarily pick an 4042 // operand to broadcast, or broadcast a clone of the original GEP. 4043 // Here, we broadcast a clone of the original. 4044 // 4045 // TODO: If at some point we decide to scalarize instructions having 4046 // loop-invariant operands, this special case will no longer be 4047 // required. We would add the scalarization decision to 4048 // collectLoopScalars() and teach getVectorValue() to broadcast 4049 // the lane-zero scalar value. 4050 auto *Clone = Builder.Insert(GEP->clone()); 4051 for (unsigned Part = 0; Part < UF; ++Part) { 4052 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4053 VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); 4054 addMetadata(EntryPart, GEP); 4055 } 4056 } else { 4057 // If the GEP has at least one loop-varying operand, we are sure to 4058 // produce a vector of pointers. But if we are only unrolling, we want 4059 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4060 // produce with the code below will be scalar (if VF == 1) or vector 4061 // (otherwise). 
Note that for the unroll-only case, we still maintain 4062 // values in the vector mapping with initVector, as we do for other 4063 // instructions. 4064 for (unsigned Part = 0; Part < UF; ++Part) { 4065 // The pointer operand of the new GEP. If it's loop-invariant, we 4066 // won't broadcast it. 4067 auto *Ptr = 4068 OrigLoop->isLoopInvariant(GEP->getPointerOperand()) 4069 ? GEP->getPointerOperand() 4070 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4071 4072 // Collect all the indices for the new GEP. If any index is 4073 // loop-invariant, we won't broadcast it. 4074 SmallVector<Value *, 4> Indices; 4075 for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { 4076 if (OrigLoop->isLoopInvariant(U.get())) 4077 Indices.push_back(U.get()); 4078 else 4079 Indices.push_back(getOrCreateVectorValue(U.get(), Part)); 4080 } 4081 4082 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4083 // but it should be a vector, otherwise. 4084 auto *NewGEP = 4085 GEP->isInBounds() 4086 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4087 Indices) 4088 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4089 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4090 "NewGEP is not a pointer vector"); 4091 VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); 4092 addMetadata(NewGEP, GEP); 4093 } 4094 } 4095 4096 break; 4097 } 4098 case Instruction::UDiv: 4099 case Instruction::SDiv: 4100 case Instruction::SRem: 4101 case Instruction::URem: 4102 case Instruction::Add: 4103 case Instruction::FAdd: 4104 case Instruction::Sub: 4105 case Instruction::FSub: 4106 case Instruction::FNeg: 4107 case Instruction::Mul: 4108 case Instruction::FMul: 4109 case Instruction::FDiv: 4110 case Instruction::FRem: 4111 case Instruction::Shl: 4112 case Instruction::LShr: 4113 case Instruction::AShr: 4114 case Instruction::And: 4115 case Instruction::Or: 4116 case Instruction::Xor: { 4117 // Just widen unops and binops. 4118 setDebugLocFromInst(Builder, &I); 4119 4120 for (unsigned Part = 0; Part < UF; ++Part) { 4121 SmallVector<Value *, 2> Ops; 4122 for (Value *Op : I.operands()) 4123 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4124 4125 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4126 4127 if (auto *VecOp = dyn_cast<Instruction>(V)) 4128 VecOp->copyIRFlags(&I); 4129 4130 // Use this vector value for all users of the original instruction. 4131 VectorLoopValueMap.setVectorValue(&I, Part, V); 4132 addMetadata(V, &I); 4133 } 4134 4135 break; 4136 } 4137 case Instruction::Select: { 4138 // Widen selects. 4139 // If the selector is loop invariant we can create a select 4140 // instruction with a scalar condition. Otherwise, use vector-select. 4141 auto *SE = PSE.getSE(); 4142 bool InvariantCond = 4143 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4144 setDebugLocFromInst(Builder, &I); 4145 4146 // The condition can be loop invariant but still defined inside the 4147 // loop. This means that we can't just use the original 'cond' value. 4148 // We have to take the 'vectorized' value and pick the first lane. 4149 // Instcombine will make this a no-op. 4150 4151 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4152 4153 for (unsigned Part = 0; Part < UF; ++Part) { 4154 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4155 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4156 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4157 Value *Sel = 4158 Builder.CreateSelect(InvariantCond ? 
ScalarCond : Cond, Op0, Op1); 4159 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4160 addMetadata(Sel, &I); 4161 } 4162 4163 break; 4164 } 4165 4166 case Instruction::ICmp: 4167 case Instruction::FCmp: { 4168 // Widen compares. Generate vector compares. 4169 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4170 auto *Cmp = dyn_cast<CmpInst>(&I); 4171 setDebugLocFromInst(Builder, Cmp); 4172 for (unsigned Part = 0; Part < UF; ++Part) { 4173 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4174 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4175 Value *C = nullptr; 4176 if (FCmp) { 4177 // Propagate fast math flags. 4178 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4179 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4180 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4181 } else { 4182 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4183 } 4184 VectorLoopValueMap.setVectorValue(&I, Part, C); 4185 addMetadata(C, &I); 4186 } 4187 4188 break; 4189 } 4190 4191 case Instruction::ZExt: 4192 case Instruction::SExt: 4193 case Instruction::FPToUI: 4194 case Instruction::FPToSI: 4195 case Instruction::FPExt: 4196 case Instruction::PtrToInt: 4197 case Instruction::IntToPtr: 4198 case Instruction::SIToFP: 4199 case Instruction::UIToFP: 4200 case Instruction::Trunc: 4201 case Instruction::FPTrunc: 4202 case Instruction::BitCast: { 4203 auto *CI = dyn_cast<CastInst>(&I); 4204 setDebugLocFromInst(Builder, CI); 4205 4206 /// Vectorize casts. 4207 Type *DestTy = 4208 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4209 4210 for (unsigned Part = 0; Part < UF; ++Part) { 4211 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4212 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4213 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4214 addMetadata(Cast, &I); 4215 } 4216 break; 4217 } 4218 4219 case Instruction::Call: { 4220 // Ignore dbg intrinsics. 4221 if (isa<DbgInfoIntrinsic>(I)) 4222 break; 4223 setDebugLocFromInst(Builder, &I); 4224 4225 Module *M = I.getParent()->getParent()->getParent(); 4226 auto *CI = cast<CallInst>(&I); 4227 4228 StringRef FnName = CI->getCalledFunction()->getName(); 4229 Function *F = CI->getCalledFunction(); 4230 Type *RetTy = ToVectorTy(CI->getType(), VF); 4231 SmallVector<Type *, 4> Tys; 4232 for (Value *ArgOperand : CI->arg_operands()) 4233 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4234 4235 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4236 4237 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4238 // version of the instruction. 4239 // Is it beneficial to perform intrinsic call compared to lib call? 4240 bool NeedToScalarize; 4241 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4242 bool UseVectorIntrinsic = 4243 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4244 assert((UseVectorIntrinsic || !NeedToScalarize) && 4245 "Instruction should be scalarized elsewhere."); 4246 4247 for (unsigned Part = 0; Part < UF; ++Part) { 4248 SmallVector<Value *, 4> Args; 4249 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4250 Value *Arg = CI->getArgOperand(i); 4251 // Some intrinsics have a scalar argument - don't replace it with a 4252 // vector. 4253 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4254 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4255 Args.push_back(Arg); 4256 } 4257 4258 Function *VectorF; 4259 if (UseVectorIntrinsic) { 4260 // Use vector version of the intrinsic. 
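        // For example (illustrative, not taken from the source): a scalar call
        // to llvm.sqrt.f64 is redeclared as llvm.sqrt.v4f64 when VF is 4,
        // because the intrinsic is overloaded on the widened type computed
        // below.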
4261 Type *TysForDecl[] = {CI->getType()}; 4262 if (VF > 1) 4263 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4264 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4265 } else { 4266 // Use vector version of the library call. 4267 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); 4268 assert(!VFnName.empty() && "Vector function name is empty."); 4269 VectorF = M->getFunction(VFnName); 4270 if (!VectorF) { 4271 // Generate a declaration 4272 FunctionType *FTy = FunctionType::get(RetTy, Tys, false); 4273 VectorF = 4274 Function::Create(FTy, Function::ExternalLinkage, VFnName, M); 4275 VectorF->copyAttributesFrom(F); 4276 } 4277 } 4278 assert(VectorF && "Can't create vector function."); 4279 4280 SmallVector<OperandBundleDef, 1> OpBundles; 4281 CI->getOperandBundlesAsDefs(OpBundles); 4282 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4283 4284 if (isa<FPMathOperator>(V)) 4285 V->copyFastMathFlags(CI); 4286 4287 VectorLoopValueMap.setVectorValue(&I, Part, V); 4288 addMetadata(V, &I); 4289 } 4290 4291 break; 4292 } 4293 4294 default: 4295 // This instruction is not vectorized by simple widening. 4296 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4297 llvm_unreachable("Unhandled instruction!"); 4298 } // end of switch. 4299 } 4300 4301 void InnerLoopVectorizer::updateAnalysis() { 4302 // Forget the original basic block. 4303 PSE.getSE()->forgetLoop(OrigLoop); 4304 4305 // DT is not kept up-to-date for outer loop vectorization 4306 if (EnableVPlanNativePath) 4307 return; 4308 4309 // Update the dominator tree information. 4310 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && 4311 "Entry does not dominate exit."); 4312 4313 DT->addNewBlock(LoopMiddleBlock, 4314 LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4315 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); 4316 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 4317 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); 4318 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 4319 } 4320 4321 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4322 // We should not collect Scalars more than once per VF. Right now, this 4323 // function is called from collectUniformsAndScalars(), which already does 4324 // this check. Collecting Scalars for VF=1 does not make any sense. 4325 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4326 "This function should not be visited twice for the same VF"); 4327 4328 SmallSetVector<Instruction *, 8> Worklist; 4329 4330 // These sets are used to seed the analysis with pointers used by memory 4331 // accesses that will remain scalar. 4332 SmallSetVector<Instruction *, 8> ScalarPtrs; 4333 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4334 4335 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4336 // The pointer operands of loads and stores will be scalar as long as the 4337 // memory access is not a gather or scatter operation. The value operand of a 4338 // store will remain scalar if the store is scalarized. 
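  // Illustrative example (hypothetical IR): for "store i32 %v, i32* %p", the
  // pointer %p is a scalar use unless the store becomes a scatter, while the
  // value %v is a scalar use only if the store is scalarized.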
4339 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4340 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4341 assert(WideningDecision != CM_Unknown && 4342 "Widening decision should be ready at this moment"); 4343 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4344 if (Ptr == Store->getValueOperand()) 4345 return WideningDecision == CM_Scalarize; 4346 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4347 "Ptr is neither a value nor a pointer operand"); 4348 return WideningDecision != CM_GatherScatter; 4349 }; 4350 4351 // A helper that returns true if the given value is a bitcast or 4352 // getelementptr instruction contained in the loop. 4353 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4354 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4355 isa<GetElementPtrInst>(V)) && 4356 !TheLoop->isLoopInvariant(V); 4357 }; 4358 4359 // A helper that evaluates a memory access's use of a pointer. If the use 4360 // will be a scalar use, and the pointer is only used by memory accesses, we 4361 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4362 // PossibleNonScalarPtrs. 4363 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4364 // We only care about bitcast and getelementptr instructions contained in 4365 // the loop. 4366 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4367 return; 4368 4369 // If the pointer has already been identified as scalar (e.g., if it was 4370 // also identified as uniform), there's nothing to do. 4371 auto *I = cast<Instruction>(Ptr); 4372 if (Worklist.count(I)) 4373 return; 4374 4375 // If the use of the pointer will be a scalar use, and all users of the 4376 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4377 // place the pointer in PossibleNonScalarPtrs. 4378 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4379 return isa<LoadInst>(U) || isa<StoreInst>(U); 4380 })) 4381 ScalarPtrs.insert(I); 4382 else 4383 PossibleNonScalarPtrs.insert(I); 4384 }; 4385 4386 // We seed the scalars analysis with three classes of instructions: (1) 4387 // instructions marked uniform-after-vectorization, (2) bitcast and 4388 // getelementptr instructions used by memory accesses requiring a scalar use, 4389 // and (3) pointer induction variables and their update instructions (we 4390 // currently only scalarize these). 4391 // 4392 // (1) Add to the worklist all instructions that have been identified as 4393 // uniform-after-vectorization. 4394 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4395 4396 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4397 // memory accesses requiring a scalar use. The pointer operands of loads and 4398 // stores will be scalar as long as the memory access is not a gather or 4399 // scatter operation. The value operand of a store will remain scalar if the 4400 // store is scalarized.
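  // Illustrative example (hypothetical IR): a loop-varying getelementptr whose
  // only user is a consecutive, widened load lands in ScalarPtrs; if the same
  // GEP is also used by, say, a PHI or stored as a value, it is placed in
  // PossibleNonScalarPtrs instead.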
4401 for (auto *BB : TheLoop->blocks()) 4402 for (auto &I : *BB) { 4403 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4404 evaluatePtrUse(Load, Load->getPointerOperand()); 4405 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4406 evaluatePtrUse(Store, Store->getPointerOperand()); 4407 evaluatePtrUse(Store, Store->getValueOperand()); 4408 } 4409 } 4410 for (auto *I : ScalarPtrs) 4411 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4412 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4413 Worklist.insert(I); 4414 } 4415 4416 // (3) Add to the worklist all pointer induction variables and their update 4417 // instructions. 4418 // 4419 // TODO: Once we are able to vectorize pointer induction variables we should 4420 // no longer insert them into the worklist here. 4421 auto *Latch = TheLoop->getLoopLatch(); 4422 for (auto &Induction : *Legal->getInductionVars()) { 4423 auto *Ind = Induction.first; 4424 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4425 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4426 continue; 4427 Worklist.insert(Ind); 4428 Worklist.insert(IndUpdate); 4429 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4430 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4431 << "\n"); 4432 } 4433 4434 // Insert the forced scalars. 4435 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4436 // induction variable when the PHI user is scalarized. 4437 auto ForcedScalar = ForcedScalars.find(VF); 4438 if (ForcedScalar != ForcedScalars.end()) 4439 for (auto *I : ForcedScalar->second) 4440 Worklist.insert(I); 4441 4442 // Expand the worklist by looking through any bitcasts and getelementptr 4443 // instructions we've already identified as scalar. This is similar to the 4444 // expansion step in collectLoopUniforms(); however, here we're only 4445 // expanding to include additional bitcasts and getelementptr instructions. 4446 unsigned Idx = 0; 4447 while (Idx != Worklist.size()) { 4448 Instruction *Dst = Worklist[Idx++]; 4449 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4450 continue; 4451 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4452 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4453 auto *J = cast<Instruction>(U); 4454 return !TheLoop->contains(J) || Worklist.count(J) || 4455 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4456 isScalarUse(J, Src)); 4457 })) { 4458 Worklist.insert(Src); 4459 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4460 } 4461 } 4462 4463 // An induction variable will remain scalar if all users of the induction 4464 // variable and induction variable update remain scalar. 4465 for (auto &Induction : *Legal->getInductionVars()) { 4466 auto *Ind = Induction.first; 4467 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4468 4469 // We already considered pointer induction variables, so there's no reason 4470 // to look at their users again. 4471 // 4472 // TODO: Once we are able to vectorize pointer induction variables we 4473 // should no longer skip over them here. 4474 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4475 continue; 4476 4477 // Determine if all users of the induction variable are scalar after 4478 // vectorization. 
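    // Illustrative example (hypothetical IR): for "%i.next = add %i, 1", %i
    // remains scalar only if every in-loop user other than %i.next is already
    // in the worklist; the same check is then applied to %i.next itself.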
4479 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4480 auto *I = cast<Instruction>(U); 4481 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4482 }); 4483 if (!ScalarInd) 4484 continue; 4485 4486 // Determine if all users of the induction variable update instruction are 4487 // scalar after vectorization. 4488 auto ScalarIndUpdate = 4489 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4490 auto *I = cast<Instruction>(U); 4491 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4492 }); 4493 if (!ScalarIndUpdate) 4494 continue; 4495 4496 // The induction variable and its update instruction will remain scalar. 4497 Worklist.insert(Ind); 4498 Worklist.insert(IndUpdate); 4499 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4500 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4501 << "\n"); 4502 } 4503 4504 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4505 } 4506 4507 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4508 if (!blockNeedsPredication(I->getParent())) 4509 return false; 4510 switch(I->getOpcode()) { 4511 default: 4512 break; 4513 case Instruction::Load: 4514 case Instruction::Store: { 4515 if (!Legal->isMaskRequired(I)) 4516 return false; 4517 auto *Ptr = getLoadStorePointerOperand(I); 4518 auto *Ty = getMemInstValueType(I); 4519 // We have already decided how to vectorize this instruction, get that 4520 // result. 4521 if (VF > 1) { 4522 InstWidening WideningDecision = getWideningDecision(I, VF); 4523 assert(WideningDecision != CM_Unknown && 4524 "Widening decision should be ready at this moment"); 4525 return WideningDecision == CM_Scalarize; 4526 } 4527 return isa<LoadInst>(I) ? 4528 !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) 4529 : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); 4530 } 4531 case Instruction::UDiv: 4532 case Instruction::SDiv: 4533 case Instruction::SRem: 4534 case Instruction::URem: 4535 return mayDivideByZero(*I); 4536 } 4537 return false; 4538 } 4539 4540 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4541 unsigned VF) { 4542 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4543 assert(getWideningDecision(I, VF) == CM_Unknown && 4544 "Decision should not be set yet."); 4545 auto *Group = getInterleavedAccessGroup(I); 4546 assert(Group && "Must have a group."); 4547 4548 // If the instruction's allocated size doesn't equal it's type size, it 4549 // requires padding and will be scalarized. 4550 auto &DL = I->getModule()->getDataLayout(); 4551 auto *ScalarTy = getMemInstValueType(I); 4552 if (hasIrregularType(ScalarTy, DL, VF)) 4553 return false; 4554 4555 // Check if masking is required. 4556 // A Group may need masking for one of two reasons: it resides in a block that 4557 // needs predication, or it was decided to use masking to deal with gaps. 4558 bool PredicatedAccessRequiresMasking = 4559 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4560 bool AccessWithGapsRequiresMasking = 4561 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4562 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4563 return true; 4564 4565 // If masked interleaving is required, we expect that the user/target had 4566 // enabled it, because otherwise it either wouldn't have been created or 4567 // it should have been invalidated by the CostModel. 
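  // Illustrative example (hypothetical loop): a factor-2 load group built from
  // "sum += A[2*i]" has a gap at the odd members, so its last vector access may
  // read past the final iteration; reaching this point for such a group means
  // the gap (or the predicated block) has to be handled by masking.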
4568 assert(useMaskedInterleavedAccesses(TTI) && 4569 "Masked interleave-groups for predicated accesses are not enabled."); 4570 4571 auto *Ty = getMemInstValueType(I); 4572 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) 4573 : TTI.isLegalMaskedStore(Ty); 4574 } 4575 4576 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4577 unsigned VF) { 4578 // Get and ensure we have a valid memory instruction. 4579 LoadInst *LI = dyn_cast<LoadInst>(I); 4580 StoreInst *SI = dyn_cast<StoreInst>(I); 4581 assert((LI || SI) && "Invalid memory instruction"); 4582 4583 auto *Ptr = getLoadStorePointerOperand(I); 4584 4585 // In order to be widened, the pointer should be consecutive, first of all. 4586 if (!Legal->isConsecutivePtr(Ptr)) 4587 return false; 4588 4589 // If the instruction is a store located in a predicated block, it will be 4590 // scalarized. 4591 if (isScalarWithPredication(I)) 4592 return false; 4593 4594 // If the instruction's allocated size doesn't equal it's type size, it 4595 // requires padding and will be scalarized. 4596 auto &DL = I->getModule()->getDataLayout(); 4597 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4598 if (hasIrregularType(ScalarTy, DL, VF)) 4599 return false; 4600 4601 return true; 4602 } 4603 4604 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4605 // We should not collect Uniforms more than once per VF. Right now, 4606 // this function is called from collectUniformsAndScalars(), which 4607 // already does this check. Collecting Uniforms for VF=1 does not make any 4608 // sense. 4609 4610 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4611 "This function should not be visited twice for the same VF"); 4612 4613 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4614 // not analyze again. Uniforms.count(VF) will return 1. 4615 Uniforms[VF].clear(); 4616 4617 // We now know that the loop is vectorizable! 4618 // Collect instructions inside the loop that will remain uniform after 4619 // vectorization. 4620 4621 // Global values, params and instructions outside of current loop are out of 4622 // scope. 4623 auto isOutOfScope = [&](Value *V) -> bool { 4624 Instruction *I = dyn_cast<Instruction>(V); 4625 return (!I || !TheLoop->contains(I)); 4626 }; 4627 4628 SetVector<Instruction *> Worklist; 4629 BasicBlock *Latch = TheLoop->getLoopLatch(); 4630 4631 // Start with the conditional branch. If the branch condition is an 4632 // instruction contained in the loop that is only used by the branch, it is 4633 // uniform. 4634 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4635 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { 4636 Worklist.insert(Cmp); 4637 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); 4638 } 4639 4640 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4641 // are pointers that are treated like consecutive pointers during 4642 // vectorization. The pointer operands of interleaved accesses are an 4643 // example. 4644 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4645 4646 // Holds pointer operands of instructions that are possibly non-uniform. 
4647 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4648 4649 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4650 InstWidening WideningDecision = getWideningDecision(I, VF); 4651 assert(WideningDecision != CM_Unknown && 4652 "Widening decision should be ready at this moment"); 4653 4654 return (WideningDecision == CM_Widen || 4655 WideningDecision == CM_Widen_Reverse || 4656 WideningDecision == CM_Interleave); 4657 }; 4658 // Iterate over the instructions in the loop, and collect all 4659 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4660 // that a consecutive-like pointer operand will be scalarized, we collect it 4661 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4662 // getelementptr instruction can be used by both vectorized and scalarized 4663 // memory instructions. For example, if a loop loads and stores from the same 4664 // location, but the store is conditional, the store will be scalarized, and 4665 // the getelementptr won't remain uniform. 4666 for (auto *BB : TheLoop->blocks()) 4667 for (auto &I : *BB) { 4668 // If there's no pointer operand, there's nothing to do. 4669 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4670 if (!Ptr) 4671 continue; 4672 4673 // True if all users of Ptr are memory accesses that have Ptr as their 4674 // pointer operand. 4675 auto UsersAreMemAccesses = 4676 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4677 return getLoadStorePointerOperand(U) == Ptr; 4678 }); 4679 4680 // Ensure the memory instruction will not be scalarized or used by 4681 // gather/scatter, making its pointer operand non-uniform. If the pointer 4682 // operand is used by any instruction other than a memory access, we 4683 // conservatively assume the pointer operand may be non-uniform. 4684 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4685 PossibleNonUniformPtrs.insert(Ptr); 4686 4687 // If the memory instruction will be vectorized and its pointer operand 4688 // is consecutive-like, or interleaving - the pointer operand should 4689 // remain uniform. 4690 else 4691 ConsecutiveLikePtrs.insert(Ptr); 4692 } 4693 4694 // Add to the Worklist all consecutive and consecutive-like pointers that 4695 // aren't also identified as possibly non-uniform. 4696 for (auto *V : ConsecutiveLikePtrs) 4697 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { 4698 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); 4699 Worklist.insert(V); 4700 } 4701 4702 // Expand Worklist in topological order: whenever a new instruction 4703 // is added , its users should be already inside Worklist. It ensures 4704 // a uniform instruction will only be used by uniform instructions. 4705 unsigned idx = 0; 4706 while (idx != Worklist.size()) { 4707 Instruction *I = Worklist[idx++]; 4708 4709 for (auto OV : I->operand_values()) { 4710 // isOutOfScope operands cannot be uniform instructions. 4711 if (isOutOfScope(OV)) 4712 continue; 4713 // First order recurrence Phi's should typically be considered 4714 // non-uniform. 4715 auto *OP = dyn_cast<PHINode>(OV); 4716 if (OP && Legal->isFirstOrderRecurrence(OP)) 4717 continue; 4718 // If all the users of the operand are uniform, then add the 4719 // operand into the uniform worklist. 
4720 auto *OI = cast<Instruction>(OV); 4721 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4722 auto *J = cast<Instruction>(U); 4723 return Worklist.count(J) || 4724 (OI == getLoadStorePointerOperand(J) && 4725 isUniformDecision(J, VF)); 4726 })) { 4727 Worklist.insert(OI); 4728 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); 4729 } 4730 } 4731 } 4732 4733 // Returns true if Ptr is the pointer operand of a memory access instruction 4734 // I, and I is known to not require scalarization. 4735 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4736 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4737 }; 4738 4739 // For an instruction to be added into Worklist above, all its users inside 4740 // the loop should also be in Worklist. However, this condition cannot be 4741 // true for phi nodes that form a cyclic dependence. We must process phi 4742 // nodes separately. An induction variable will remain uniform if all users 4743 // of the induction variable and induction variable update remain uniform. 4744 // The code below handles both pointer and non-pointer induction variables. 4745 for (auto &Induction : *Legal->getInductionVars()) { 4746 auto *Ind = Induction.first; 4747 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4748 4749 // Determine if all users of the induction variable are uniform after 4750 // vectorization. 4751 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4752 auto *I = cast<Instruction>(U); 4753 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4754 isVectorizedMemAccessUse(I, Ind); 4755 }); 4756 if (!UniformInd) 4757 continue; 4758 4759 // Determine if all users of the induction variable update instruction are 4760 // uniform after vectorization. 4761 auto UniformIndUpdate = 4762 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4763 auto *I = cast<Instruction>(U); 4764 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4765 isVectorizedMemAccessUse(I, IndUpdate); 4766 }); 4767 if (!UniformIndUpdate) 4768 continue; 4769 4770 // The induction variable and its update instruction will remain uniform. 4771 Worklist.insert(Ind); 4772 Worklist.insert(IndUpdate); 4773 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); 4774 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate 4775 << "\n"); 4776 } 4777 4778 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4779 } 4780 4781 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4782 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4783 4784 if (Legal->getRuntimePointerChecking()->Need) { 4785 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4786 "runtime pointer checks needed. Enable vectorization of this " 4787 "loop with '#pragma clang loop vectorize(enable)' when " 4788 "compiling with -Os/-Oz", 4789 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4790 return true; 4791 } 4792 4793 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4794 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4795 "runtime SCEV checks needed. Enable vectorization of this " 4796 "loop with '#pragma clang loop vectorize(enable)' when " 4797 "compiling with -Os/-Oz", 4798 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4799 return true; 4800 } 4801 4802 // FIXME: Avoid specializing for stride==1 instead of bailing out. 
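  // Illustrative example: for a loop accessing A[i * Stride] with a run-time
  // Stride, LAA records Stride as a symbolic stride, and versioning would add
  // a "Stride == 1" runtime check, which is not acceptable when optimizing for
  // size.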
4803 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4804 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4805 "runtime stride == 1 checks needed. Enable vectorization of " 4806 "this loop with '#pragma clang loop vectorize(enable)' when " 4807 "compiling with -Os/-Oz", 4808 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4809 return true; 4810 } 4811 4812 return false; 4813 } 4814 4815 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4816 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4817 // TODO: It may by useful to do since it's still likely to be dynamically 4818 // uniform if the target can skip. 4819 reportVectorizationFailure( 4820 "Not inserting runtime ptr check for divergent target", 4821 "runtime pointer checks needed. Not enabled for divergent target", 4822 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4823 return None; 4824 } 4825 4826 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4827 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4828 if (TC == 1) { 4829 reportVectorizationFailure("Single iteration (non) loop", 4830 "loop trip count is one, irrelevant for vectorization", 4831 "SingleIterationLoop", ORE, TheLoop); 4832 return None; 4833 } 4834 4835 switch (ScalarEpilogueStatus) { 4836 case CM_ScalarEpilogueAllowed: 4837 return computeFeasibleMaxVF(TC); 4838 case CM_ScalarEpilogueNotNeededUsePredicate: 4839 LLVM_DEBUG( 4840 dbgs() << "LV: vector predicate hint/switch found.\n" 4841 << "LV: Not allowing scalar epilogue, creating predicated " 4842 << "vector loop.\n"); 4843 break; 4844 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4845 // fallthrough as a special case of OptForSize 4846 case CM_ScalarEpilogueNotAllowedOptSize: 4847 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4848 LLVM_DEBUG( 4849 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4850 else 4851 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4852 << "count.\n"); 4853 4854 // Bail if runtime checks are required, which are not good when optimising 4855 // for size. 4856 if (runtimeChecksRequired()) 4857 return None; 4858 break; 4859 } 4860 4861 // Now try the tail folding 4862 4863 // Invalidate interleave groups that require an epilogue if we can't mask 4864 // the interleave-group. 4865 if (!useMaskedInterleavedAccesses(TTI)) 4866 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4867 4868 unsigned MaxVF = computeFeasibleMaxVF(TC); 4869 if (TC > 0 && TC % MaxVF == 0) { 4870 // Accept MaxVF if we do not have a tail. 4871 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4872 return MaxVF; 4873 } 4874 4875 // If we don't know the precise trip count, or if the trip count that we 4876 // found modulo the vectorization factor is not zero, try to fold the tail 4877 // by masking. 4878 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4879 if (Legal->prepareToFoldTailByMasking()) { 4880 FoldTailByMasking = true; 4881 return MaxVF; 4882 } 4883 4884 if (TC == 0) { 4885 reportVectorizationFailure( 4886 "Unable to calculate the loop count due to complex control flow", 4887 "unable to calculate the loop count due to complex control flow", 4888 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4889 return None; 4890 } 4891 4892 reportVectorizationFailure( 4893 "Cannot optimize for size and vectorize at the same time.", 4894 "cannot optimize for size and vectorize at the same time. 
" 4895 "Enable vectorization of this loop with '#pragma clang loop " 4896 "vectorize(enable)' when compiling with -Os/-Oz", 4897 "NoTailLoopWithOptForSize", ORE, TheLoop); 4898 return None; 4899 } 4900 4901 unsigned 4902 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 4903 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4904 unsigned SmallestType, WidestType; 4905 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4906 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 4907 4908 // Get the maximum safe dependence distance in bits computed by LAA. 4909 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4910 // the memory accesses that is most restrictive (involved in the smallest 4911 // dependence distance). 4912 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 4913 4914 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 4915 4916 unsigned MaxVectorSize = WidestRegister / WidestType; 4917 4918 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4919 << " / " << WidestType << " bits.\n"); 4920 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4921 << WidestRegister << " bits.\n"); 4922 4923 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 4924 " into one vector!"); 4925 if (MaxVectorSize == 0) { 4926 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 4927 MaxVectorSize = 1; 4928 return MaxVectorSize; 4929 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 4930 isPowerOf2_32(ConstTripCount)) { 4931 // We need to clamp the VF to be the ConstTripCount. There is no point in 4932 // choosing a higher viable VF as done in the loop below. 4933 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 4934 << ConstTripCount << "\n"); 4935 MaxVectorSize = ConstTripCount; 4936 return MaxVectorSize; 4937 } 4938 4939 unsigned MaxVF = MaxVectorSize; 4940 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 4941 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 4942 // Collect all viable vectorization factors larger than the default MaxVF 4943 // (i.e. MaxVectorSize). 4944 SmallVector<unsigned, 8> VFs; 4945 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 4946 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 4947 VFs.push_back(VS); 4948 4949 // For each VF calculate its register usage. 4950 auto RUs = calculateRegisterUsage(VFs); 4951 4952 // Select the largest VF which doesn't require more registers than existing 4953 // ones. 
4954 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); 4955 for (int i = RUs.size() - 1; i >= 0; --i) { 4956 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { 4957 MaxVF = VFs[i]; 4958 break; 4959 } 4960 } 4961 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 4962 if (MaxVF < MinVF) { 4963 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 4964 << ") with target's minimum: " << MinVF << '\n'); 4965 MaxVF = MinVF; 4966 } 4967 } 4968 } 4969 return MaxVF; 4970 } 4971 4972 VectorizationFactor 4973 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 4974 float Cost = expectedCost(1).first; 4975 const float ScalarCost = Cost; 4976 unsigned Width = 1; 4977 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 4978 4979 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 4980 if (ForceVectorization && MaxVF > 1) { 4981 // Ignore scalar width, because the user explicitly wants vectorization. 4982 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 4983 // evaluation. 4984 Cost = std::numeric_limits<float>::max(); 4985 } 4986 4987 for (unsigned i = 2; i <= MaxVF; i *= 2) { 4988 // Notice that the vector loop needs to be executed less times, so 4989 // we need to divide the cost of the vector loops by the width of 4990 // the vector elements. 4991 VectorizationCostTy C = expectedCost(i); 4992 float VectorCost = C.first / (float)i; 4993 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 4994 << " costs: " << (int)VectorCost << ".\n"); 4995 if (!C.second && !ForceVectorization) { 4996 LLVM_DEBUG( 4997 dbgs() << "LV: Not considering vector loop of width " << i 4998 << " because it will not generate any vector instructions.\n"); 4999 continue; 5000 } 5001 if (VectorCost < Cost) { 5002 Cost = VectorCost; 5003 Width = i; 5004 } 5005 } 5006 5007 if (!EnableCondStoresVectorization && NumPredStores) { 5008 reportVectorizationFailure("There are conditional stores.", 5009 "store that is conditionally executed prevents vectorization", 5010 "ConditionalStore", ORE, TheLoop); 5011 Width = 1; 5012 Cost = ScalarCost; 5013 } 5014 5015 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5016 << "LV: Vectorization seems to be not beneficial, " 5017 << "but was forced by a user.\n"); 5018 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5019 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5020 return Factor; 5021 } 5022 5023 std::pair<unsigned, unsigned> 5024 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5025 unsigned MinWidth = -1U; 5026 unsigned MaxWidth = 8; 5027 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5028 5029 // For each block. 5030 for (BasicBlock *BB : TheLoop->blocks()) { 5031 // For each instruction in the loop. 5032 for (Instruction &I : BB->instructionsWithoutDebug()) { 5033 Type *T = I.getType(); 5034 5035 // Skip ignored values. 5036 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5037 continue; 5038 5039 // Only examine Loads, Stores and PHINodes. 5040 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5041 continue; 5042 5043 // Examine PHI nodes that are reduction variables. Update the type to 5044 // account for the recurrence type. 
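      // E.g. (illustrative) an i32 add reduction whose value is known to fit
      // in i8: the descriptor records i8 as the recurrence type, so the
      // narrower width is what feeds the min/max computation below.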
5045 if (auto *PN = dyn_cast<PHINode>(&I)) { 5046 if (!Legal->isReductionVariable(PN)) 5047 continue; 5048 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; 5049 T = RdxDesc.getRecurrenceType(); 5050 } 5051 5052 // Examine the stored values. 5053 if (auto *ST = dyn_cast<StoreInst>(&I)) 5054 T = ST->getValueOperand()->getType(); 5055 5056 // Ignore loaded pointer types and stored pointer types that are not 5057 // vectorizable. 5058 // 5059 // FIXME: The check here attempts to predict whether a load or store will 5060 // be vectorized. We only know this for certain after a VF has 5061 // been selected. Here, we assume that if an access can be 5062 // vectorized, it will be. We should also look at extending this 5063 // optimization to non-pointer types. 5064 // 5065 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5066 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5067 continue; 5068 5069 MinWidth = std::min(MinWidth, 5070 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5071 MaxWidth = std::max(MaxWidth, 5072 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5073 } 5074 } 5075 5076 return {MinWidth, MaxWidth}; 5077 } 5078 5079 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5080 unsigned LoopCost) { 5081 // -- The interleave heuristics -- 5082 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5083 // There are many micro-architectural considerations that we can't predict 5084 // at this level. For example, frontend pressure (on decode or fetch) due to 5085 // code size, or the number and capabilities of the execution ports. 5086 // 5087 // We use the following heuristics to select the interleave count: 5088 // 1. If the code has reductions, then we interleave to break the cross 5089 // iteration dependency. 5090 // 2. If the loop is really small, then we interleave to reduce the loop 5091 // overhead. 5092 // 3. We don't interleave if we think that we will spill registers to memory 5093 // due to the increased register pressure. 5094 5095 if (!isScalarEpilogueAllowed()) 5096 return 1; 5097 5098 // We used the distance for the interleave count. 5099 if (Legal->getMaxSafeDepDistBytes() != -1U) 5100 return 1; 5101 5102 // Do not interleave loops with a relatively small trip count. 5103 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5104 if (TC > 1 && TC < TinyTripCountInterleaveThreshold) 5105 return 1; 5106 5107 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); 5108 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5109 << " registers\n"); 5110 5111 if (VF == 1) { 5112 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5113 TargetNumRegisters = ForceTargetNumScalarRegs; 5114 } else { 5115 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5116 TargetNumRegisters = ForceTargetNumVectorRegs; 5117 } 5118 5119 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5120 // We divide by these constants so assume that we have at least one 5121 // instruction that uses at least one register. 5122 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); 5123 5124 // We calculate the interleave count using the following formula. 5125 // Subtract the number of loop invariants from the number of available 5126 // registers. These registers are used by all of the interleaved instances. 
5127 // Next, divide the remaining registers by the number of registers that is 5128 // required by the loop, in order to estimate how many parallel instances 5129 // fit without causing spills. All of this is rounded down if necessary to be 5130 // a power of two. We want power of two interleave count to simplify any 5131 // addressing operations or alignment considerations. 5132 // We also want power of two interleave counts to ensure that the induction 5133 // variable of the vector loop wraps to zero, when tail is folded by masking; 5134 // this currently happens when OptForSize, in which case IC is set to 1 above. 5135 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / 5136 R.MaxLocalUsers); 5137 5138 // Don't count the induction variable as interleaved. 5139 if (EnableIndVarRegisterHeur) 5140 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / 5141 std::max(1U, (R.MaxLocalUsers - 1))); 5142 5143 // Clamp the interleave ranges to reasonable counts. 5144 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5145 5146 // Check if the user has overridden the max. 5147 if (VF == 1) { 5148 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5149 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5150 } else { 5151 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5152 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5153 } 5154 5155 // If the trip count is constant, limit the interleave count to be less than 5156 // the trip count divided by VF. 5157 if (TC > 0) { 5158 assert(TC >= VF && "VF exceeds trip count?"); 5159 if ((TC / VF) < MaxInterleaveCount) 5160 MaxInterleaveCount = (TC / VF); 5161 } 5162 5163 // If we did not calculate the cost for VF (because the user selected the VF) 5164 // then we calculate the cost of VF here. 5165 if (LoopCost == 0) 5166 LoopCost = expectedCost(VF).first; 5167 5168 assert(LoopCost && "Non-zero loop cost expected"); 5169 5170 // Clamp the calculated IC to be between the 1 and the max interleave count 5171 // that the target and trip count allows. 5172 if (IC > MaxInterleaveCount) 5173 IC = MaxInterleaveCount; 5174 else if (IC < 1) 5175 IC = 1; 5176 5177 // Interleave if we vectorized this loop and there is a reduction that could 5178 // benefit from interleaving. 5179 if (VF > 1 && !Legal->getReductionVars()->empty()) { 5180 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5181 return IC; 5182 } 5183 5184 // Note that if we've already vectorized the loop we will have done the 5185 // runtime check and so interleaving won't require further checks. 5186 bool InterleavingRequiresRuntimePointerCheck = 5187 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5188 5189 // We want to interleave small loops in order to reduce the loop overhead and 5190 // potentially expose ILP opportunities. 5191 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5192 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5193 // We assume that the cost overhead is 1 and we use the cost model 5194 // to estimate the cost of the loop and interleave until the cost of the 5195 // loop overhead is about 5% of the cost of the loop. 5196 unsigned SmallIC = 5197 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5198 5199 // Interleave until store/load ports (estimated by max interleave count) are 5200 // saturated. 
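    // Worked example (hypothetical values): IC = 8 with 2 stores and 1 load
    // gives StoresIC = 4 and LoadsIC = 8; if SmallIC is 4, the code below
    // interleaves by max(4, 8) = 8 when EnableLoadStoreRuntimeInterleave is
    // set, and otherwise falls back to SmallIC.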
5201 unsigned NumStores = Legal->getNumStores(); 5202 unsigned NumLoads = Legal->getNumLoads(); 5203 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5204 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5205 5206 // If we have a scalar reduction (vector reductions are already dealt with 5207 // by this point), we can increase the critical path length if the loop 5208 // we're interleaving is inside another loop. Limit this, by default, to 2, so the 5209 // critical path only gets increased by one reduction operation. 5210 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) { 5211 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5212 SmallIC = std::min(SmallIC, F); 5213 StoresIC = std::min(StoresIC, F); 5214 LoadsIC = std::min(LoadsIC, F); 5215 } 5216 5217 if (EnableLoadStoreRuntimeInterleave && 5218 std::max(StoresIC, LoadsIC) > SmallIC) { 5219 LLVM_DEBUG( 5220 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5221 return std::max(StoresIC, LoadsIC); 5222 } 5223 5224 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5225 return SmallIC; 5226 } 5227 5228 // Interleave if this is a large loop (small loops are already dealt with by 5229 // this point) that could benefit from interleaving. 5230 bool HasReductions = !Legal->getReductionVars()->empty(); 5231 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5232 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5233 return IC; 5234 } 5235 5236 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5237 return 1; 5238 } 5239 5240 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5241 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5242 // This function calculates the register usage by measuring the highest number 5243 // of values that are alive at a single location. Obviously, this is a very 5244 // rough estimation. We scan the loop in topological order and 5245 // assign a number to each instruction. We use RPO to ensure that defs are 5246 // visited before their users. We assume that each instruction that has in-loop 5247 // users starts an interval. We record every time that an in-loop value is 5248 // used, so we have a list of the first and last occurrences of each 5249 // instruction. Next, we transpose this data structure into a multi map that 5250 // holds the list of intervals that *end* at a specific location. This multi 5251 // map allows us to perform a linear search. We scan the instructions linearly 5252 // and record each time that a new interval starts, by placing it in a set. 5253 // If we find this value in the multi-map then we remove it from the set. 5254 // The max register usage is the maximum size of the set. 5255 // We also search for instructions that are defined outside the loop, but are 5256 // used inside the loop. We need this number separately from the max-interval 5257 // usage number because when we unroll, loop-invariant values do not take 5258 // more registers. 5259 LoopBlocksDFS DFS(TheLoop); 5260 DFS.perform(LI); 5261 5262 RegisterUsage RU; 5263 5264 // Each 'key' in the map opens a new interval. The values 5265 // of the map are the index of the 'last seen' usage of the 5266 // instruction that is the key. 5267 using IntervalMap = DenseMap<Instruction *, unsigned>; 5268 5269 // Maps instruction to its index. 5270 SmallVector<Instruction *, 64> IdxToInstr; 5271 // Marks the end of each interval.
5272 IntervalMap EndPoint; 5273 // Saves the list of instruction indices that are used in the loop. 5274 SmallPtrSet<Instruction *, 8> Ends; 5275 // Saves the list of values that are used in the loop but are 5276 // defined outside the loop, such as arguments and constants. 5277 SmallPtrSet<Value *, 8> LoopInvariants; 5278 5279 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5280 for (Instruction &I : BB->instructionsWithoutDebug()) { 5281 IdxToInstr.push_back(&I); 5282 5283 // Save the end location of each USE. 5284 for (Value *U : I.operands()) { 5285 auto *Instr = dyn_cast<Instruction>(U); 5286 5287 // Ignore non-instruction values such as arguments, constants, etc. 5288 if (!Instr) 5289 continue; 5290 5291 // If this instruction is outside the loop then record it and continue. 5292 if (!TheLoop->contains(Instr)) { 5293 LoopInvariants.insert(Instr); 5294 continue; 5295 } 5296 5297 // Overwrite previous end points. 5298 EndPoint[Instr] = IdxToInstr.size(); 5299 Ends.insert(Instr); 5300 } 5301 } 5302 } 5303 5304 // Saves the list of intervals that end with the index in 'key'. 5305 using InstrList = SmallVector<Instruction *, 2>; 5306 DenseMap<unsigned, InstrList> TransposeEnds; 5307 5308 // Transpose the EndPoints to a list of values that end at each index. 5309 for (auto &Interval : EndPoint) 5310 TransposeEnds[Interval.second].push_back(Interval.first); 5311 5312 SmallPtrSet<Instruction *, 8> OpenIntervals; 5313 5314 // Get the size of the widest register. 5315 unsigned MaxSafeDepDist = -1U; 5316 if (Legal->getMaxSafeDepDistBytes() != -1U) 5317 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5318 unsigned WidestRegister = 5319 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5320 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5321 5322 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5323 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); 5324 5325 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5326 5327 // A lambda that gets the register usage for the given type and VF. 5328 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5329 if (Ty->isTokenTy()) 5330 return 0U; 5331 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5332 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5333 }; 5334 5335 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5336 Instruction *I = IdxToInstr[i]; 5337 5338 // Remove all of the instructions that end at this location. 5339 InstrList &List = TransposeEnds[i]; 5340 for (Instruction *ToRemove : List) 5341 OpenIntervals.erase(ToRemove); 5342 5343 // Ignore instructions that are never used within the loop. 5344 if (Ends.find(I) == Ends.end()) 5345 continue; 5346 5347 // Skip ignored values. 5348 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5349 continue; 5350 5351 // For each VF find the maximum usage of registers. 5352 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5353 if (VFs[j] == 1) { 5354 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); 5355 continue; 5356 } 5357 collectUniformsAndScalars(VFs[j]); 5358 // Count the number of live intervals. 5359 unsigned RegUsage = 0; 5360 for (auto Inst : OpenIntervals) { 5361 // Skip ignored values for VF > 1. 
5362 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || 5363 isScalarAfterVectorization(Inst, VFs[j])) 5364 continue; 5365 RegUsage += GetRegUsage(Inst->getType(), VFs[j]); 5366 } 5367 MaxUsages[j] = std::max(MaxUsages[j], RegUsage); 5368 } 5369 5370 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5371 << OpenIntervals.size() << '\n'); 5372 5373 // Add the current instruction to the list of open intervals. 5374 OpenIntervals.insert(I); 5375 } 5376 5377 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5378 unsigned Invariant = 0; 5379 if (VFs[i] == 1) 5380 Invariant = LoopInvariants.size(); 5381 else { 5382 for (auto Inst : LoopInvariants) 5383 Invariant += GetRegUsage(Inst->getType(), VFs[i]); 5384 } 5385 5386 LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); 5387 LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); 5388 LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant 5389 << '\n'); 5390 5391 RU.LoopInvariantRegs = Invariant; 5392 RU.MaxLocalUsers = MaxUsages[i]; 5393 RUs[i] = RU; 5394 } 5395 5396 return RUs; 5397 } 5398 5399 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) { 5400 // TODO: The cost model for emulated masked load/store is completely 5401 // broken. This hack guides the cost model to use an artificially 5402 // high enough value to practically disable vectorization with such 5403 // operations, except where the previously deployed legality hack allowed 5404 // using very low cost values. This is to avoid regressions coming simply 5405 // from moving the "masked load/store" check from legality to the cost model. 5406 // Masked load/gather emulation was previously never allowed, and masked 5407 // store/scatter emulation was allowed only for a limited number of stores. 5408 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5409 return isa<LoadInst>(I) || 5410 (isa<StoreInst>(I) && 5411 NumPredStores > NumberOfStoresToPredicate); 5412 } 5413 5414 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5415 // If we aren't vectorizing the loop, or if we've already collected the 5416 // instructions to scalarize, there's nothing to do. Collection may already 5417 // have occurred if we have a user-selected VF and are now computing the 5418 // expected cost for interleaving. 5419 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end()) 5420 return; 5421 5422 // Initialize a mapping for VF in InstsToScalarize. If we find that it's 5423 // not profitable to scalarize any instructions, the presence of VF in the 5424 // map will indicate that we've analyzed it already. 5425 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; 5426 5427 // Find all the instructions that are scalar with predication in the loop and 5428 // determine if it would be better not to if-convert the blocks they are in. 5429 // If so, we also record the instructions to scalarize. 5430 for (BasicBlock *BB : TheLoop->blocks()) { 5431 if (!blockNeedsPredication(BB)) 5432 continue; 5433 for (Instruction &I : *BB) 5434 if (isScalarWithPredication(&I)) { 5435 ScalarCostsTy ScalarCosts; 5436 // Do not apply discount logic if hacked cost is needed 5437 // for emulated masked memrefs. 5438 if (!useEmulatedMaskMemRefHack(&I) && 5439 computePredInstDiscount(&I, ScalarCosts, VF) >= 0) 5440 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); 5441 // Remember that BB will remain after vectorization.
5442 PredicatedBBsAfterVectorization.insert(BB); 5443 } 5444 } 5445 } 5446 5447 int LoopVectorizationCostModel::computePredInstDiscount( 5448 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5449 unsigned VF) { 5450 assert(!isUniformAfterVectorization(PredInst, VF) && 5451 "Instruction marked uniform-after-vectorization will be predicated"); 5452 5453 // Initialize the discount to zero, meaning that the scalar version and the 5454 // vector version cost the same. 5455 int Discount = 0; 5456 5457 // Holds instructions to analyze. The instructions we visit are mapped in 5458 // ScalarCosts. Those instructions are the ones that would be scalarized if 5459 // we find that the scalar version costs less. 5460 SmallVector<Instruction *, 8> Worklist; 5461 5462 // Returns true if the given instruction can be scalarized. 5463 auto canBeScalarized = [&](Instruction *I) -> bool { 5464 // We only attempt to scalarize instructions forming a single-use chain 5465 // from the original predicated block that would otherwise be vectorized. 5466 // Although not strictly necessary, we give up on instructions we know will 5467 // already be scalar to avoid traversing chains that are unlikely to be 5468 // beneficial. 5469 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5470 isScalarAfterVectorization(I, VF)) 5471 return false; 5472 5473 // If the instruction is scalar with predication, it will be analyzed 5474 // separately. We ignore it within the context of PredInst. 5475 if (isScalarWithPredication(I)) 5476 return false; 5477 5478 // If any of the instruction's operands are uniform after vectorization, 5479 // the instruction cannot be scalarized. This prevents, for example, a 5480 // masked load from being scalarized. 5481 // 5482 // We assume we will only emit a value for lane zero of an instruction 5483 // marked uniform after vectorization, rather than VF identical values. 5484 // Thus, if we scalarize an instruction that uses a uniform, we would 5485 // create uses of values corresponding to the lanes we aren't emitting code 5486 // for. This behavior can be changed by allowing getScalarValue to clone 5487 // the lane zero values for uniforms rather than asserting. 5488 for (Use &U : I->operands()) 5489 if (auto *J = dyn_cast<Instruction>(U.get())) 5490 if (isUniformAfterVectorization(J, VF)) 5491 return false; 5492 5493 // Otherwise, we can scalarize the instruction. 5494 return true; 5495 }; 5496 5497 // Compute the expected cost discount from scalarizing the entire expression 5498 // feeding the predicated instruction. We currently only consider expressions 5499 // that are single-use instruction chains. 5500 Worklist.push_back(PredInst); 5501 while (!Worklist.empty()) { 5502 Instruction *I = Worklist.pop_back_val(); 5503 5504 // If we've already analyzed the instruction, there's nothing to do. 5505 if (ScalarCosts.find(I) != ScalarCosts.end()) 5506 continue; 5507 5508 // Compute the cost of the vector instruction. Note that this cost already 5509 // includes the scalarization overhead of the predicated instruction. 5510 unsigned VectorCost = getInstructionCost(I, VF).first; 5511 5512 // Compute the cost of the scalarized instruction. This cost is the cost of 5513 // the instruction as if it wasn't if-converted and instead remained in the 5514 // predicated block. We will scale this cost by block probability after 5515 // computing the scalarization overhead. 
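    // Worked example (hypothetical costs): for VF = 4, a vector cost of 12 and
    // a per-lane scalar cost of 2 give ScalarCost = 8 before any overhead;
    // after dividing by the reciprocal block probability (2) this becomes 4,
    // so the discount grows by 12 - 4 = 8 and scalarization looks profitable.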
5516 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5517 5518 // Compute the scalarization overhead of needed insertelement instructions 5519 // and phi nodes. 5520 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5521 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5522 true, false); 5523 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5524 } 5525 5526 // Compute the scalarization overhead of needed extractelement 5527 // instructions. For each of the instruction's operands, if the operand can 5528 // be scalarized, add it to the worklist; otherwise, account for the 5529 // overhead. 5530 for (Use &U : I->operands()) 5531 if (auto *J = dyn_cast<Instruction>(U.get())) { 5532 assert(VectorType::isValidElementType(J->getType()) && 5533 "Instruction has non-scalar type"); 5534 if (canBeScalarized(J)) 5535 Worklist.push_back(J); 5536 else if (needsExtract(J, VF)) 5537 ScalarCost += TTI.getScalarizationOverhead( 5538 ToVectorTy(J->getType(),VF), false, true); 5539 } 5540 5541 // Scale the total scalar cost by block probability. 5542 ScalarCost /= getReciprocalPredBlockProb(); 5543 5544 // Compute the discount. A non-negative discount means the vector version 5545 // of the instruction costs more, and scalarizing would be beneficial. 5546 Discount += VectorCost - ScalarCost; 5547 ScalarCosts[I] = ScalarCost; 5548 } 5549 5550 return Discount; 5551 } 5552 5553 LoopVectorizationCostModel::VectorizationCostTy 5554 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5555 VectorizationCostTy Cost; 5556 5557 // For each block. 5558 for (BasicBlock *BB : TheLoop->blocks()) { 5559 VectorizationCostTy BlockCost; 5560 5561 // For each instruction in the old loop. 5562 for (Instruction &I : BB->instructionsWithoutDebug()) { 5563 // Skip ignored values. 5564 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5565 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5566 continue; 5567 5568 VectorizationCostTy C = getInstructionCost(&I, VF); 5569 5570 // Check if we should override the cost. 5571 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5572 C.first = ForceTargetInstructionCost; 5573 5574 BlockCost.first += C.first; 5575 BlockCost.second |= C.second; 5576 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5577 << " for VF " << VF << " For instruction: " << I 5578 << '\n'); 5579 } 5580 5581 // If we are vectorizing a predicated block, it will have been 5582 // if-converted. This means that the block's instructions (aside from 5583 // stores and instructions that may divide by zero) will now be 5584 // unconditionally executed. For the scalar case, we may not always execute 5585 // the predicated block. Thus, scale the block's cost by the probability of 5586 // executing it. 5587 if (VF == 1 && blockNeedsPredication(BB)) 5588 BlockCost.first /= getReciprocalPredBlockProb(); 5589 5590 Cost.first += BlockCost.first; 5591 Cost.second |= BlockCost.second; 5592 } 5593 5594 return Cost; 5595 } 5596 5597 /// Gets Address Access SCEV after verifying that the access pattern 5598 /// is loop invariant except the induction variable dependence. 5599 /// 5600 /// This SCEV can be sent to the Target in order to estimate the address 5601 /// calculation cost. 
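/// Returns nullptr if \p Ptr is not a GEP, or if any GEP index other than an
/// induction variable varies inside \p TheLoop.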
static const SCEV *getAddressAccessSCEV(
              Value *Ptr,
              LoopVectorizationLegality *Legal,
              PredicatedScalarEvolution &PSE,
              const Loop *TheLoop) {

  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a GEP with all loop-invariant indices except for one
  // which should be an induction variable.
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP of the form ptr, %inv, %ind, %inv. Return the
  // SCEV for Ptr.
  return PSE.getSCEV(Ptr);
}

static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}

unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                                 unsigned VF) {
  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
  Type *ValTy = getMemInstValueType(I);
  auto SE = PSE.getSE();

  unsigned Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it is known at compile time.
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  Cost += VF *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated store, it may not be executed for each vector
  // lane. Scale the cost by the probability of executing the predicated
  // block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    if (useEmulatedMaskMemRefHack(I))
      // Artificially set the cost to a value high enough to practically
      // disable vectorization with such operations.
5669 Cost = 3000000; 5670 } 5671 5672 return Cost; 5673 } 5674 5675 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5676 unsigned VF) { 5677 Type *ValTy = getMemInstValueType(I); 5678 Type *VectorTy = ToVectorTy(ValTy, VF); 5679 unsigned Alignment = getLoadStoreAlignment(I); 5680 Value *Ptr = getLoadStorePointerOperand(I); 5681 unsigned AS = getLoadStoreAddressSpace(I); 5682 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5683 5684 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5685 "Stride should be 1 or -1 for consecutive memory access"); 5686 unsigned Cost = 0; 5687 if (Legal->isMaskRequired(I)) 5688 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 5689 else 5690 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5691 5692 bool Reverse = ConsecutiveStride < 0; 5693 if (Reverse) 5694 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5695 return Cost; 5696 } 5697 5698 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5699 unsigned VF) { 5700 Type *ValTy = getMemInstValueType(I); 5701 Type *VectorTy = ToVectorTy(ValTy, VF); 5702 unsigned Alignment = getLoadStoreAlignment(I); 5703 unsigned AS = getLoadStoreAddressSpace(I); 5704 if (isa<LoadInst>(I)) { 5705 return TTI.getAddressComputationCost(ValTy) + 5706 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5707 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5708 } 5709 StoreInst *SI = cast<StoreInst>(I); 5710 5711 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5712 return TTI.getAddressComputationCost(ValTy) + 5713 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5714 (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( 5715 Instruction::ExtractElement, 5716 VectorTy, VF - 1)); 5717 } 5718 5719 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5720 unsigned VF) { 5721 Type *ValTy = getMemInstValueType(I); 5722 Type *VectorTy = ToVectorTy(ValTy, VF); 5723 unsigned Alignment = getLoadStoreAlignment(I); 5724 Value *Ptr = getLoadStorePointerOperand(I); 5725 5726 return TTI.getAddressComputationCost(VectorTy) + 5727 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5728 Legal->isMaskRequired(I), Alignment); 5729 } 5730 5731 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5732 unsigned VF) { 5733 Type *ValTy = getMemInstValueType(I); 5734 Type *VectorTy = ToVectorTy(ValTy, VF); 5735 unsigned AS = getLoadStoreAddressSpace(I); 5736 5737 auto Group = getInterleavedAccessGroup(I); 5738 assert(Group && "Fail to get an interleaved access group."); 5739 5740 unsigned InterleaveFactor = Group->getFactor(); 5741 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5742 5743 // Holds the indices of existing members in an interleaved load group. 5744 // An interleaved store group doesn't need this as it doesn't allow gaps. 5745 SmallVector<unsigned, 4> Indices; 5746 if (isa<LoadInst>(I)) { 5747 for (unsigned i = 0; i < InterleaveFactor; i++) 5748 if (Group->getMember(i)) 5749 Indices.push_back(i); 5750 } 5751 5752 // Calculate the cost of the whole interleaved group. 
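  // TTI costs the group as one wide access of VF * InterleaveFactor elements
  // together with the shuffles that split out (for loads) or combine (for
  // stores) the individual members; reversed groups additionally pay one
  // reverse shuffle per member, added below.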
5753 bool UseMaskForGaps = 5754 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5755 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5756 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5757 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5758 5759 if (Group->isReverse()) { 5760 // TODO: Add support for reversed masked interleaved access. 5761 assert(!Legal->isMaskRequired(I) && 5762 "Reverse masked interleaved access not supported."); 5763 Cost += Group->getNumMembers() * 5764 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5765 } 5766 return Cost; 5767 } 5768 5769 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5770 unsigned VF) { 5771 // Calculate scalar cost only. Vectorization cost should be ready at this 5772 // moment. 5773 if (VF == 1) { 5774 Type *ValTy = getMemInstValueType(I); 5775 unsigned Alignment = getLoadStoreAlignment(I); 5776 unsigned AS = getLoadStoreAddressSpace(I); 5777 5778 return TTI.getAddressComputationCost(ValTy) + 5779 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5780 } 5781 return getWideningCost(I, VF); 5782 } 5783 5784 LoopVectorizationCostModel::VectorizationCostTy 5785 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5786 // If we know that this instruction will remain uniform, check the cost of 5787 // the scalar version. 5788 if (isUniformAfterVectorization(I, VF)) 5789 VF = 1; 5790 5791 if (VF > 1 && isProfitableToScalarize(I, VF)) 5792 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5793 5794 // Forced scalars do not have any scalarization overhead. 5795 auto ForcedScalar = ForcedScalars.find(VF); 5796 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5797 auto InstSet = ForcedScalar->second; 5798 if (InstSet.find(I) != InstSet.end()) 5799 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5800 } 5801 5802 Type *VectorTy; 5803 unsigned C = getInstructionCost(I, VF, VectorTy); 5804 5805 bool TypeNotScalarized = 5806 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5807 return VectorizationCostTy(C, TypeNotScalarized); 5808 } 5809 5810 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5811 unsigned VF) { 5812 5813 if (VF == 1) 5814 return 0; 5815 5816 unsigned Cost = 0; 5817 Type *RetTy = ToVectorTy(I->getType(), VF); 5818 if (!RetTy->isVoidTy() && 5819 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5820 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5821 5822 // Some targets keep addresses scalar. 5823 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5824 return Cost; 5825 5826 // Some targets support efficient element stores. 5827 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5828 return Cost; 5829 5830 // Collect operands to consider. 5831 CallInst *CI = dyn_cast<CallInst>(I); 5832 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 5833 5834 // Skip operands that do not require extraction/scalarization and do not incur 5835 // any overhead. 5836 return Cost + TTI.getOperandsScalarizationOverhead( 5837 filterExtractingOperands(Ops, VF), VF); 5838 } 5839 5840 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 5841 if (VF == 1) 5842 return; 5843 NumPredStores = 0; 5844 for (BasicBlock *BB : TheLoop->blocks()) { 5845 // For each instruction in the old loop. 
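    // Only memory instructions get an explicit widening decision here;
    // anything without a pointer operand is skipped and costed later in
    // getInstructionCost().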
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniform(Ptr) &&
          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose the better solution for the current VF, record this decision,
      // and use it during vectorization.
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(I, VF, CM_Scalarize,
                            (VF * getMemoryInstructionCost(I, 1)));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(Member, VF, CM_Scalarize,
                                (VF * getMemoryInstructionCost(Member, 1)));
        }
      }
    } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}

unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
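    // For example, with VF == 4 such a branch is costed as four scalar
    // branches plus the overhead of extracting the four i1 compare lanes
    // that feed them.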
6013 bool ScalarPredicatedBB = false; 6014 BranchInst *BI = cast<BranchInst>(I); 6015 if (VF > 1 && BI->isConditional() && 6016 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6017 PredicatedBBsAfterVectorization.end() || 6018 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6019 PredicatedBBsAfterVectorization.end())) 6020 ScalarPredicatedBB = true; 6021 6022 if (ScalarPredicatedBB) { 6023 // Return cost for branches around scalarized and predicated blocks. 6024 Type *Vec_i1Ty = 6025 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6026 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6027 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6028 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6029 // The back-edge branch will remain, as will all scalar branches. 6030 return TTI.getCFInstrCost(Instruction::Br); 6031 else 6032 // This branch will be eliminated by if-conversion. 6033 return 0; 6034 // Note: We currently assume zero cost for an unconditional branch inside 6035 // a predicated block since it will become a fall-through, although we 6036 // may decide in the future to call TTI for all branches. 6037 } 6038 case Instruction::PHI: { 6039 auto *Phi = cast<PHINode>(I); 6040 6041 // First-order recurrences are replaced by vector shuffles inside the loop. 6042 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6043 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6044 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6045 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6046 6047 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6048 // converted into select instructions. We require N - 1 selects per phi 6049 // node, where N is the number of incoming values. 6050 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6051 return (Phi->getNumIncomingValues() - 1) * 6052 TTI.getCmpSelInstrCost( 6053 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6054 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6055 6056 return TTI.getCFInstrCost(Instruction::PHI); 6057 } 6058 case Instruction::UDiv: 6059 case Instruction::SDiv: 6060 case Instruction::URem: 6061 case Instruction::SRem: 6062 // If we have a predicated instruction, it may not be executed for each 6063 // vector lane. Get the scalarization cost and scale this amount by the 6064 // probability of executing the predicated block. If the instruction is not 6065 // predicated, we fall through to the next case. 6066 if (VF > 1 && isScalarWithPredication(I)) { 6067 unsigned Cost = 0; 6068 6069 // These instructions have a non-void type, so account for the phi nodes 6070 // that we will create. This cost is likely to be zero. The phi node 6071 // cost, if any, should be scaled by the block probability because it 6072 // models a copy at the end of each predicated block. 6073 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6074 6075 // The cost of the non-predicated instruction. 6076 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6077 6078 // The cost of insertelement and extractelement instructions needed for 6079 // scalarization. 6080 Cost += getScalarizationOverhead(I, VF); 6081 6082 // Scale the cost by the probability of executing the predicated blocks. 6083 // This assumes the predicated block for each vector lane is equally 6084 // likely. 
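      // With the default 50% estimate this simply halves the accumulated cost.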
6085 return Cost / getReciprocalPredBlockProb(); 6086 } 6087 LLVM_FALLTHROUGH; 6088 case Instruction::Add: 6089 case Instruction::FAdd: 6090 case Instruction::Sub: 6091 case Instruction::FSub: 6092 case Instruction::Mul: 6093 case Instruction::FMul: 6094 case Instruction::FDiv: 6095 case Instruction::FRem: 6096 case Instruction::Shl: 6097 case Instruction::LShr: 6098 case Instruction::AShr: 6099 case Instruction::And: 6100 case Instruction::Or: 6101 case Instruction::Xor: { 6102 // Since we will replace the stride by 1 the multiplication should go away. 6103 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6104 return 0; 6105 // Certain instructions can be cheaper to vectorize if they have a constant 6106 // second vector operand. One example of this are shifts on x86. 6107 Value *Op2 = I->getOperand(1); 6108 TargetTransformInfo::OperandValueProperties Op2VP; 6109 TargetTransformInfo::OperandValueKind Op2VK = 6110 TTI.getOperandInfo(Op2, Op2VP); 6111 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6112 Op2VK = TargetTransformInfo::OK_UniformValue; 6113 6114 SmallVector<const Value *, 4> Operands(I->operand_values()); 6115 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6116 return N * TTI.getArithmeticInstrCost( 6117 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6118 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); 6119 } 6120 case Instruction::FNeg: { 6121 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6122 return N * TTI.getArithmeticInstrCost( 6123 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6124 TargetTransformInfo::OK_AnyValue, 6125 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6126 I->getOperand(0)); 6127 } 6128 case Instruction::Select: { 6129 SelectInst *SI = cast<SelectInst>(I); 6130 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6131 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6132 Type *CondTy = SI->getCondition()->getType(); 6133 if (!ScalarCond) 6134 CondTy = VectorType::get(CondTy, VF); 6135 6136 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6137 } 6138 case Instruction::ICmp: 6139 case Instruction::FCmp: { 6140 Type *ValTy = I->getOperand(0)->getType(); 6141 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6142 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6143 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6144 VectorTy = ToVectorTy(ValTy, VF); 6145 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6146 } 6147 case Instruction::Store: 6148 case Instruction::Load: { 6149 unsigned Width = VF; 6150 if (Width > 1) { 6151 InstWidening Decision = getWideningDecision(I, Width); 6152 assert(Decision != CM_Unknown && 6153 "CM decision should be taken at this point"); 6154 if (Decision == CM_Scalarize) 6155 Width = 1; 6156 } 6157 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6158 return getMemoryInstructionCost(I, VF); 6159 } 6160 case Instruction::ZExt: 6161 case Instruction::SExt: 6162 case Instruction::FPToUI: 6163 case Instruction::FPToSI: 6164 case Instruction::FPExt: 6165 case Instruction::PtrToInt: 6166 case Instruction::IntToPtr: 6167 case Instruction::SIToFP: 6168 case Instruction::UIToFP: 6169 case Instruction::Trunc: 6170 case Instruction::FPTrunc: 6171 case Instruction::BitCast: { 6172 // We optimize the truncation of induction variables having constant 6173 // integer steps. 
The cost of these truncations is the same as the scalar 6174 // operation. 6175 if (isOptimizableIVTruncate(I, VF)) { 6176 auto *Trunc = cast<TruncInst>(I); 6177 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6178 Trunc->getSrcTy(), Trunc); 6179 } 6180 6181 Type *SrcScalarTy = I->getOperand(0)->getType(); 6182 Type *SrcVecTy = 6183 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6184 if (canTruncateToMinimalBitwidth(I, VF)) { 6185 // This cast is going to be shrunk. This may remove the cast or it might 6186 // turn it into slightly different cast. For example, if MinBW == 16, 6187 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6188 // 6189 // Calculate the modified src and dest types. 6190 Type *MinVecTy = VectorTy; 6191 if (I->getOpcode() == Instruction::Trunc) { 6192 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6193 VectorTy = 6194 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6195 } else if (I->getOpcode() == Instruction::ZExt || 6196 I->getOpcode() == Instruction::SExt) { 6197 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6198 VectorTy = 6199 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6200 } 6201 } 6202 6203 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6204 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6205 } 6206 case Instruction::Call: { 6207 bool NeedToScalarize; 6208 CallInst *CI = cast<CallInst>(I); 6209 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6210 if (getVectorIntrinsicIDForCall(CI, TLI)) 6211 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6212 return CallCost; 6213 } 6214 default: 6215 // The cost of executing VF copies of the scalar instruction. This opcode 6216 // is unknown. Assume that it is the same as 'mul'. 6217 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6218 getScalarizationOverhead(I, VF); 6219 } // end of switch. 6220 } 6221 6222 char LoopVectorize::ID = 0; 6223 6224 static const char lv_name[] = "Loop Vectorization"; 6225 6226 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6227 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6228 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6229 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6230 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6231 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6232 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6233 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6234 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6235 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6236 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6237 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6238 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6239 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6240 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6241 6242 namespace llvm { 6243 6244 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6245 6246 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6247 bool VectorizeOnlyWhenForced) { 6248 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6249 } 6250 6251 } // end namespace llvm 6252 6253 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6254 // Check if the pointer operand of a load or store instruction is 6255 // consecutive. 
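  // Legal->isConsecutivePtr() reports +1 or -1 for unit-stride (forward or
  // reverse) accesses; both count as consecutive here.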
6256 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6257 return Legal->isConsecutivePtr(Ptr); 6258 return false; 6259 } 6260 6261 void LoopVectorizationCostModel::collectValuesToIgnore() { 6262 // Ignore ephemeral values. 6263 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6264 6265 // Ignore type-promoting instructions we identified during reduction 6266 // detection. 6267 for (auto &Reduction : *Legal->getReductionVars()) { 6268 RecurrenceDescriptor &RedDes = Reduction.second; 6269 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6270 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6271 } 6272 // Ignore type-casting instructions we identified during induction 6273 // detection. 6274 for (auto &Induction : *Legal->getInductionVars()) { 6275 InductionDescriptor &IndDes = Induction.second; 6276 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6277 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6278 } 6279 } 6280 6281 // TODO: we could return a pair of values that specify the max VF and 6282 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6283 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6284 // doesn't have a cost model that can choose which plan to execute if 6285 // more than one is generated. 6286 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6287 LoopVectorizationCostModel &CM) { 6288 unsigned WidestType; 6289 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6290 return WidestVectorRegBits / WidestType; 6291 } 6292 6293 VectorizationFactor 6294 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6295 unsigned VF = UserVF; 6296 // Outer loop handling: They may require CFG and instruction level 6297 // transformations before even evaluating whether vectorization is profitable. 6298 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6299 // the vectorization pipeline. 6300 if (!OrigLoop->empty()) { 6301 // If the user doesn't provide a vectorization factor, determine a 6302 // reasonable one. 6303 if (!UserVF) { 6304 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6305 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6306 6307 // Make sure we have a VF > 1 for stress testing. 6308 if (VPlanBuildStressTest && VF < 2) { 6309 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6310 << "overriding computed VF.\n"); 6311 VF = 4; 6312 } 6313 } 6314 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6315 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6316 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6317 << " to build VPlans.\n"); 6318 buildVPlans(VF, VF); 6319 6320 // For VPlan build stress testing, we bail out after VPlan construction. 6321 if (VPlanBuildStressTest) 6322 return VectorizationFactor::Disabled(); 6323 6324 return {VF, 0}; 6325 } 6326 6327 LLVM_DEBUG( 6328 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6329 "VPlan-native path.\n"); 6330 return VectorizationFactor::Disabled(); 6331 } 6332 6333 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { 6334 assert(OrigLoop->empty() && "Inner loop expected."); 6335 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); 6336 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6337 return None; 6338 6339 // Invalidate interleave groups if all blocks of loop will be predicated. 
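  // When the tail is folded by masking, every block (including the header) is
  // predicated, so interleave groups can only be kept if the target supports
  // masked interleaved accesses.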
6340 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6341 !useMaskedInterleavedAccesses(*TTI)) { 6342 LLVM_DEBUG( 6343 dbgs() 6344 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6345 "which requires masked-interleaved support.\n"); 6346 CM.InterleaveInfo.reset(); 6347 } 6348 6349 if (UserVF) { 6350 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6351 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6352 // Collect the instructions (and their associated costs) that will be more 6353 // profitable to scalarize. 6354 CM.selectUserVectorizationFactor(UserVF); 6355 buildVPlansWithVPRecipes(UserVF, UserVF); 6356 LLVM_DEBUG(printPlans(dbgs())); 6357 return {{UserVF, 0}}; 6358 } 6359 6360 unsigned MaxVF = MaybeMaxVF.getValue(); 6361 assert(MaxVF != 0 && "MaxVF is zero."); 6362 6363 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6364 // Collect Uniform and Scalar instructions after vectorization with VF. 6365 CM.collectUniformsAndScalars(VF); 6366 6367 // Collect the instructions (and their associated costs) that will be more 6368 // profitable to scalarize. 6369 if (VF > 1) 6370 CM.collectInstsToScalarize(VF); 6371 } 6372 6373 buildVPlansWithVPRecipes(1, MaxVF); 6374 LLVM_DEBUG(printPlans(dbgs())); 6375 if (MaxVF == 1) 6376 return VectorizationFactor::Disabled(); 6377 6378 // Select the optimal vectorization factor. 6379 return CM.selectVectorizationFactor(MaxVF); 6380 } 6381 6382 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6383 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6384 << '\n'); 6385 BestVF = VF; 6386 BestUF = UF; 6387 6388 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6389 return !Plan->hasVF(VF); 6390 }); 6391 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6392 } 6393 6394 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6395 DominatorTree *DT) { 6396 // Perform the actual loop transformation. 6397 6398 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6399 VPCallbackILV CallbackILV(ILV); 6400 6401 VPTransformState State{BestVF, BestUF, LI, 6402 DT, ILV.Builder, ILV.VectorLoopValueMap, 6403 &ILV, CallbackILV}; 6404 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6405 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6406 6407 //===------------------------------------------------===// 6408 // 6409 // Notice: any optimization or new instruction that go 6410 // into the code below should also be implemented in 6411 // the cost-model. 6412 // 6413 //===------------------------------------------------===// 6414 6415 // 2. Copy and widen instructions from the old loop into the new loop. 6416 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6417 VPlans.front()->execute(&State); 6418 6419 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6420 // predication, updating analyses. 6421 ILV.fixVectorizedLoop(); 6422 } 6423 6424 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6425 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6426 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6427 6428 // We create new control-flow for the vectorized loop, so the original 6429 // condition will be dead after vectorization if it's only used by the 6430 // branch. 
6431 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6432 if (Cmp && Cmp->hasOneUse()) 6433 DeadInstructions.insert(Cmp); 6434 6435 // We create new "steps" for induction variable updates to which the original 6436 // induction variables map. An original update instruction will be dead if 6437 // all its users except the induction variable are dead. 6438 for (auto &Induction : *Legal->getInductionVars()) { 6439 PHINode *Ind = Induction.first; 6440 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6441 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6442 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6443 DeadInstructions.end(); 6444 })) 6445 DeadInstructions.insert(IndUpdate); 6446 6447 // We record as "Dead" also the type-casting instructions we had identified 6448 // during induction analysis. We don't need any handling for them in the 6449 // vectorized loop because we have proven that, under a proper runtime 6450 // test guarding the vectorized loop, the value of the phi, and the casted 6451 // value of the phi, are the same. The last instruction in this casting chain 6452 // will get its scalar/vector/widened def from the scalar/vector/widened def 6453 // of the respective phi node. Any other casts in the induction def-use chain 6454 // have no other uses outside the phi update chain, and will be ignored. 6455 InductionDescriptor &IndDes = Induction.second; 6456 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6457 DeadInstructions.insert(Casts.begin(), Casts.end()); 6458 } 6459 } 6460 6461 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6462 6463 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6464 6465 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6466 Instruction::BinaryOps BinOp) { 6467 // When unrolling and the VF is 1, we only need to add a simple scalar. 6468 Type *Ty = Val->getType(); 6469 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6470 6471 if (Ty->isFloatingPointTy()) { 6472 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6473 6474 // Floating point operations had to be 'fast' to enable the unrolling. 6475 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6476 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6477 } 6478 Constant *C = ConstantInt::get(Ty, StartIdx); 6479 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6480 } 6481 6482 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6483 SmallVector<Metadata *, 4> MDs; 6484 // Reserve first location for self reference to the LoopID metadata node. 6485 MDs.push_back(nullptr); 6486 bool IsUnrollMetadata = false; 6487 MDNode *LoopID = L->getLoopID(); 6488 if (LoopID) { 6489 // First find existing loop unrolling disable metadata. 6490 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6491 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6492 if (MD) { 6493 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6494 IsUnrollMetadata = 6495 S && S->getString().startswith("llvm.loop.unroll.disable"); 6496 } 6497 MDs.push_back(LoopID->getOperand(i)); 6498 } 6499 } 6500 6501 if (!IsUnrollMetadata) { 6502 // Add runtime unroll disable metadata. 
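    // Append an MDNode wrapping the string "llvm.loop.unroll.runtime.disable"
    // to the (possibly rebuilt) loop ID, so later passes do not runtime-unroll
    // the already-vectorized loop.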
6503 LLVMContext &Context = L->getHeader()->getContext(); 6504 SmallVector<Metadata *, 1> DisableOperands; 6505 DisableOperands.push_back( 6506 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6507 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6508 MDs.push_back(DisableNode); 6509 MDNode *NewLoopID = MDNode::get(Context, MDs); 6510 // Set operand 0 to refer to the loop id itself. 6511 NewLoopID->replaceOperandWith(0, NewLoopID); 6512 L->setLoopID(NewLoopID); 6513 } 6514 } 6515 6516 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6517 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6518 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6519 bool PredicateAtRangeStart = Predicate(Range.Start); 6520 6521 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6522 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6523 Range.End = TmpVF; 6524 break; 6525 } 6526 6527 return PredicateAtRangeStart; 6528 } 6529 6530 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6531 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6532 /// of VF's starting at a given VF and extending it as much as possible. Each 6533 /// vectorization decision can potentially shorten this sub-range during 6534 /// buildVPlan(). 6535 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6536 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6537 VFRange SubRange = {VF, MaxVF + 1}; 6538 VPlans.push_back(buildVPlan(SubRange)); 6539 VF = SubRange.End; 6540 } 6541 } 6542 6543 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6544 VPlanPtr &Plan) { 6545 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6546 6547 // Look for cached value. 6548 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6549 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6550 if (ECEntryIt != EdgeMaskCache.end()) 6551 return ECEntryIt->second; 6552 6553 VPValue *SrcMask = createBlockInMask(Src, Plan); 6554 6555 // The terminator has to be a branch inst! 6556 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6557 assert(BI && "Unexpected terminator found"); 6558 6559 if (!BI->isConditional()) 6560 return EdgeMaskCache[Edge] = SrcMask; 6561 6562 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6563 assert(EdgeMask && "No Edge Mask found for condition"); 6564 6565 if (BI->getSuccessor(0) != Dst) 6566 EdgeMask = Builder.createNot(EdgeMask); 6567 6568 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6569 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6570 6571 return EdgeMaskCache[Edge] = EdgeMask; 6572 } 6573 6574 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6575 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6576 6577 // Look for cached value. 6578 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6579 if (BCEntryIt != BlockMaskCache.end()) 6580 return BCEntryIt->second; 6581 6582 // All-one mask is modelled as no-mask following the convention for masked 6583 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6584 VPValue *BlockMask = nullptr; 6585 6586 if (OrigLoop->getHeader() == BB) { 6587 if (!CM.blockNeedsPredication(BB)) 6588 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6589 6590 // Introduce the early-exit compare IV <= BTC to form header block mask. 
6591 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6592 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6593 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6594 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6595 return BlockMaskCache[BB] = BlockMask; 6596 } 6597 6598 // This is the block mask. We OR all incoming edges. 6599 for (auto *Predecessor : predecessors(BB)) { 6600 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6601 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6602 return BlockMaskCache[BB] = EdgeMask; 6603 6604 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6605 BlockMask = EdgeMask; 6606 continue; 6607 } 6608 6609 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6610 } 6611 6612 return BlockMaskCache[BB] = BlockMask; 6613 } 6614 6615 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, 6616 VFRange &Range, 6617 VPlanPtr &Plan) { 6618 const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); 6619 if (!IG) 6620 return nullptr; 6621 6622 // Now check if IG is relevant for VF's in the given range. 6623 auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { 6624 return [=](unsigned VF) -> bool { 6625 return (VF >= 2 && // Query is illegal for VF == 1 6626 CM.getWideningDecision(I, VF) == 6627 LoopVectorizationCostModel::CM_Interleave); 6628 }; 6629 }; 6630 if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) 6631 return nullptr; 6632 6633 // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) 6634 // range. If it's the primary member of the IG construct a VPInterleaveRecipe. 6635 // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. 6636 assert(I == IG->getInsertPos() && 6637 "Generating a recipe for an adjunct member of an interleave group"); 6638 6639 VPValue *Mask = nullptr; 6640 if (Legal->isMaskRequired(I)) 6641 Mask = createBlockInMask(I->getParent(), Plan); 6642 6643 return new VPInterleaveRecipe(IG, Mask); 6644 } 6645 6646 VPWidenMemoryInstructionRecipe * 6647 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6648 VPlanPtr &Plan) { 6649 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6650 return nullptr; 6651 6652 auto willWiden = [&](unsigned VF) -> bool { 6653 if (VF == 1) 6654 return false; 6655 if (CM.isScalarAfterVectorization(I, VF) || 6656 CM.isProfitableToScalarize(I, VF)) 6657 return false; 6658 LoopVectorizationCostModel::InstWidening Decision = 6659 CM.getWideningDecision(I, VF); 6660 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6661 "CM decision should be taken at this point."); 6662 assert(Decision != LoopVectorizationCostModel::CM_Interleave && 6663 "Interleave memory opportunity should be caught earlier."); 6664 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6665 }; 6666 6667 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6668 return nullptr; 6669 6670 VPValue *Mask = nullptr; 6671 if (Legal->isMaskRequired(I)) 6672 Mask = createBlockInMask(I->getParent(), Plan); 6673 6674 return new VPWidenMemoryInstructionRecipe(*I, Mask); 6675 } 6676 6677 VPWidenIntOrFpInductionRecipe * 6678 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6679 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6680 // Check if this is an integer or fp induction. If so, build the recipe that 6681 // produces its scalar and vector values. 
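    // Pointer inductions fall through and return nullptr here; they are
    // handled by the generic widen-PHI path instead.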
6682 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); 6683 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6684 II.getKind() == InductionDescriptor::IK_FpInduction) 6685 return new VPWidenIntOrFpInductionRecipe(Phi); 6686 6687 return nullptr; 6688 } 6689 6690 // Optimize the special case where the source is a constant integer 6691 // induction variable. Notice that we can only optimize the 'trunc' case 6692 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6693 // (c) other casts depend on pointer size. 6694 6695 // Determine whether \p K is a truncation based on an induction variable that 6696 // can be optimized. 6697 auto isOptimizableIVTruncate = 6698 [&](Instruction *K) -> std::function<bool(unsigned)> { 6699 return 6700 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6701 }; 6702 6703 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6704 isOptimizableIVTruncate(I), Range)) 6705 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6706 cast<TruncInst>(I)); 6707 return nullptr; 6708 } 6709 6710 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6711 PHINode *Phi = dyn_cast<PHINode>(I); 6712 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6713 return nullptr; 6714 6715 // We know that all PHIs in non-header blocks are converted into selects, so 6716 // we don't have to worry about the insertion order and we can just use the 6717 // builder. At this point we generate the predication tree. There may be 6718 // duplications since this is a simple recursive scan, but future 6719 // optimizations will clean it up. 6720 6721 SmallVector<VPValue *, 2> Masks; 6722 unsigned NumIncoming = Phi->getNumIncomingValues(); 6723 for (unsigned In = 0; In < NumIncoming; In++) { 6724 VPValue *EdgeMask = 6725 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6726 assert((EdgeMask || NumIncoming == 1) && 6727 "Multiple predecessors with one having a full mask"); 6728 if (EdgeMask) 6729 Masks.push_back(EdgeMask); 6730 } 6731 return new VPBlendRecipe(Phi, Masks); 6732 } 6733 6734 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6735 VFRange &Range) { 6736 6737 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6738 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6739 6740 if (IsPredicated) 6741 return false; 6742 6743 auto IsVectorizableOpcode = [](unsigned Opcode) { 6744 switch (Opcode) { 6745 case Instruction::Add: 6746 case Instruction::And: 6747 case Instruction::AShr: 6748 case Instruction::BitCast: 6749 case Instruction::Br: 6750 case Instruction::Call: 6751 case Instruction::FAdd: 6752 case Instruction::FCmp: 6753 case Instruction::FDiv: 6754 case Instruction::FMul: 6755 case Instruction::FNeg: 6756 case Instruction::FPExt: 6757 case Instruction::FPToSI: 6758 case Instruction::FPToUI: 6759 case Instruction::FPTrunc: 6760 case Instruction::FRem: 6761 case Instruction::FSub: 6762 case Instruction::GetElementPtr: 6763 case Instruction::ICmp: 6764 case Instruction::IntToPtr: 6765 case Instruction::Load: 6766 case Instruction::LShr: 6767 case Instruction::Mul: 6768 case Instruction::Or: 6769 case Instruction::PHI: 6770 case Instruction::PtrToInt: 6771 case Instruction::SDiv: 6772 case Instruction::Select: 6773 case Instruction::SExt: 6774 case Instruction::Shl: 6775 case Instruction::SIToFP: 6776 case Instruction::SRem: 6777 case Instruction::Store: 6778 case 
Instruction::Sub: 6779 case Instruction::Trunc: 6780 case Instruction::UDiv: 6781 case Instruction::UIToFP: 6782 case Instruction::URem: 6783 case Instruction::Xor: 6784 case Instruction::ZExt: 6785 return true; 6786 } 6787 return false; 6788 }; 6789 6790 if (!IsVectorizableOpcode(I->getOpcode())) 6791 return false; 6792 6793 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6794 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6795 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6796 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6797 return false; 6798 } 6799 6800 auto willWiden = [&](unsigned VF) -> bool { 6801 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6802 CM.isProfitableToScalarize(I, VF))) 6803 return false; 6804 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6805 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6806 // The following case may be scalarized depending on the VF. 6807 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6808 // version of the instruction. 6809 // Is it beneficial to perform intrinsic call compared to lib call? 6810 bool NeedToScalarize; 6811 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6812 bool UseVectorIntrinsic = 6813 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6814 return UseVectorIntrinsic || !NeedToScalarize; 6815 } 6816 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6817 assert(CM.getWideningDecision(I, VF) == 6818 LoopVectorizationCostModel::CM_Scalarize && 6819 "Memory widening decisions should have been taken care by now"); 6820 return false; 6821 } 6822 return true; 6823 }; 6824 6825 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6826 return false; 6827 6828 // Success: widen this instruction. We optimize the common case where 6829 // consecutive instructions can be represented by a single recipe. 6830 if (!VPBB->empty()) { 6831 VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); 6832 if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) 6833 return true; 6834 } 6835 6836 VPBB->appendRecipe(new VPWidenRecipe(I)); 6837 return true; 6838 } 6839 6840 VPBasicBlock *VPRecipeBuilder::handleReplication( 6841 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 6842 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 6843 VPlanPtr &Plan) { 6844 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 6845 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 6846 Range); 6847 6848 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6849 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6850 6851 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 6852 6853 // Find if I uses a predicated instruction. If so, it will use its scalar 6854 // value. Avoid hoisting the insert-element which packs the scalar value into 6855 // a vector value, as that happens iff all users use the vector value. 6856 for (auto &Op : I->operands()) 6857 if (auto *PredInst = dyn_cast<Instruction>(Op)) 6858 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 6859 PredInst2Recipe[PredInst]->setAlsoPack(false); 6860 6861 // Finalize the recipe for Instr, first if it is not predicated. 
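  // A non-predicated replicating recipe stays in the current VPBasicBlock;
  // a predicated one is wrapped below in its own if-then region, and a fresh
  // VPBasicBlock continues after that region.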
6862 if (!IsPredicated) { 6863 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 6864 VPBB->appendRecipe(Recipe); 6865 return VPBB; 6866 } 6867 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 6868 assert(VPBB->getSuccessors().empty() && 6869 "VPBB has successors when handling predicated replication."); 6870 // Record predicated instructions for above packing optimizations. 6871 PredInst2Recipe[I] = Recipe; 6872 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 6873 VPBlockUtils::insertBlockAfter(Region, VPBB); 6874 auto *RegSucc = new VPBasicBlock(); 6875 VPBlockUtils::insertBlockAfter(RegSucc, Region); 6876 return RegSucc; 6877 } 6878 6879 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 6880 VPRecipeBase *PredRecipe, 6881 VPlanPtr &Plan) { 6882 // Instructions marked for predication are replicated and placed under an 6883 // if-then construct to prevent side-effects. 6884 6885 // Generate recipes to compute the block mask for this region. 6886 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 6887 6888 // Build the triangular if-then region. 6889 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 6890 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 6891 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 6892 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 6893 auto *PHIRecipe = 6894 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 6895 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 6896 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 6897 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 6898 6899 // Note: first set Entry as region entry and then connect successors starting 6900 // from it in order, to propagate the "parent" of each VPBasicBlock. 6901 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 6902 VPBlockUtils::connectBlocks(Pred, Exit); 6903 6904 return Region; 6905 } 6906 6907 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 6908 VPlanPtr &Plan, VPBasicBlock *VPBB) { 6909 VPRecipeBase *Recipe = nullptr; 6910 // Check if Instr should belong to an interleave memory recipe, or already 6911 // does. In the latter case Instr is irrelevant. 6912 if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { 6913 VPBB->appendRecipe(Recipe); 6914 return true; 6915 } 6916 6917 // Check if Instr is a memory operation that should be widened. 6918 if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { 6919 VPBB->appendRecipe(Recipe); 6920 return true; 6921 } 6922 6923 // Check if Instr should form some PHI recipe. 6924 if ((Recipe = tryToOptimizeInduction(Instr, Range))) { 6925 VPBB->appendRecipe(Recipe); 6926 return true; 6927 } 6928 if ((Recipe = tryToBlend(Instr, Plan))) { 6929 VPBB->appendRecipe(Recipe); 6930 return true; 6931 } 6932 if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { 6933 VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); 6934 return true; 6935 } 6936 6937 // Check if Instr is to be widened by a general VPWidenRecipe, after 6938 // having first checked for specific widening recipes that deal with 6939 // Interleave Groups, Inductions and Phi nodes. 
6940 if (tryToWiden(Instr, VPBB, Range)) 6941 return true; 6942 6943 return false; 6944 } 6945 6946 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 6947 unsigned MaxVF) { 6948 assert(OrigLoop->empty() && "Inner loop expected."); 6949 6950 // Collect conditions feeding internal conditional branches; they need to be 6951 // represented in VPlan for it to model masking. 6952 SmallPtrSet<Value *, 1> NeedDef; 6953 6954 auto *Latch = OrigLoop->getLoopLatch(); 6955 for (BasicBlock *BB : OrigLoop->blocks()) { 6956 if (BB == Latch) 6957 continue; 6958 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 6959 if (Branch && Branch->isConditional()) 6960 NeedDef.insert(Branch->getCondition()); 6961 } 6962 6963 // If the tail is to be folded by masking, the primary induction variable 6964 // needs to be represented in VPlan for it to model early-exit masking. 6965 // Also, both the Phi and the live-out instruction of each reduction are 6966 // required in order to introduce a select between them in VPlan. 6967 if (CM.foldTailByMasking()) { 6968 NeedDef.insert(Legal->getPrimaryInduction()); 6969 for (auto &Reduction : *Legal->getReductionVars()) { 6970 NeedDef.insert(Reduction.first); 6971 NeedDef.insert(Reduction.second.getLoopExitInstr()); 6972 } 6973 } 6974 6975 // Collect instructions from the original loop that will become trivially dead 6976 // in the vectorized loop. We don't need to vectorize these instructions. For 6977 // example, original induction update instructions can become dead because we 6978 // separately emit induction "steps" when generating code for the new loop. 6979 // Similarly, we create a new latch condition when setting up the structure 6980 // of the new loop, so the old one can become dead. 6981 SmallPtrSet<Instruction *, 4> DeadInstructions; 6982 collectTriviallyDeadInstructions(DeadInstructions); 6983 6984 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6985 VFRange SubRange = {VF, MaxVF + 1}; 6986 VPlans.push_back( 6987 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); 6988 VF = SubRange.End; 6989 } 6990 } 6991 6992 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 6993 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 6994 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6995 // Hold a mapping from predicated instructions to their recipes, in order to 6996 // fix their AlsoPack behavior if a user is determined to replicate and use a 6997 // scalar instead of vector value. 6998 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 6999 7000 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7001 DenseMap<Instruction *, Instruction *> SinkAfterInverse; 7002 7003 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7004 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7005 auto Plan = std::make_unique<VPlan>(VPBB); 7006 7007 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); 7008 // Represent values that will have defs inside VPlan. 7009 for (Value *V : NeedDef) 7010 Plan->addVPValue(V); 7011 7012 // Scan the body of the loop in a topological order to visit each basic block 7013 // after having visited its predecessor basic blocks. 7014 LoopBlocksDFS DFS(OrigLoop); 7015 DFS.perform(LI); 7016 7017 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7018 // Relevant instructions from basic block BB will be grouped into VPRecipe 7019 // ingredients and fill a new VPBasicBlock. 
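// Note that one original basic block may be spread over several
// VPBasicBlocks once predicated replication introduces regions; VPBBsForBB
// below numbers those extra blocks, producing names such as "for.body.0",
// "for.body.1" (hypothetical names, assuming the source block is "for.body").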
7020 unsigned VPBBsForBB = 0; 7021 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7022 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7023 VPBB = FirstVPBBForBB; 7024 Builder.setInsertPoint(VPBB); 7025 7026 std::vector<Instruction *> Ingredients; 7027 7028 // Organize the ingredients to vectorize from current basic block in the 7029 // right order. 7030 for (Instruction &I : BB->instructionsWithoutDebug()) { 7031 Instruction *Instr = &I; 7032 7033 // First filter out irrelevant instructions, to ensure no recipes are 7034 // built for them. 7035 if (isa<BranchInst>(Instr) || 7036 DeadInstructions.find(Instr) != DeadInstructions.end()) 7037 continue; 7038 7039 // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct 7040 // member of the IG, do not construct any Recipe for it. 7041 const InterleaveGroup<Instruction> *IG = 7042 CM.getInterleavedAccessGroup(Instr); 7043 if (IG && Instr != IG->getInsertPos() && 7044 Range.Start >= 2 && // Query is illegal for VF == 1 7045 CM.getWideningDecision(Instr, Range.Start) == 7046 LoopVectorizationCostModel::CM_Interleave) { 7047 auto SinkCandidate = SinkAfterInverse.find(Instr); 7048 if (SinkCandidate != SinkAfterInverse.end()) 7049 Ingredients.push_back(SinkCandidate->second); 7050 continue; 7051 } 7052 7053 // Move instructions to handle first-order recurrences, step 1: avoid 7054 // handling this instruction until after we've handled the instruction it 7055 // should follow. 7056 auto SAIt = SinkAfter.find(Instr); 7057 if (SAIt != SinkAfter.end()) { 7058 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" 7059 << *SAIt->second 7060 << " to vectorize a 1st order recurrence.\n"); 7061 SinkAfterInverse[SAIt->second] = Instr; 7062 continue; 7063 } 7064 7065 Ingredients.push_back(Instr); 7066 7067 // Move instructions to handle first-order recurrences, step 2: push the 7068 // instruction to be sunk at its insertion point. 7069 auto SAInvIt = SinkAfterInverse.find(Instr); 7070 if (SAInvIt != SinkAfterInverse.end()) 7071 Ingredients.push_back(SAInvIt->second); 7072 } 7073 7074 // Introduce each ingredient into VPlan. 7075 for (Instruction *Instr : Ingredients) { 7076 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7077 continue; 7078 7079 // Otherwise, if all widening options failed, Instruction is to be 7080 // replicated. This may create a successor for VPBB. 7081 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7082 Instr, Range, VPBB, PredInst2Recipe, Plan); 7083 if (NextVPBB != VPBB) { 7084 VPBB = NextVPBB; 7085 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7086 : ""); 7087 } 7088 } 7089 } 7090 7091 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7092 // may also be empty, such as the last one VPBB, reflecting original 7093 // basic-blocks with no recipes. 7094 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7095 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7096 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7097 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7098 delete PreEntry; 7099 7100 // Finally, if tail is folded by masking, introduce selects between the phi 7101 // and the live-out instruction of each reduction, at the end of the latch. 
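// Sketch of the resulting recipe (hypothetical IR, assuming an integer add
// reduction with header phi %phi and loop-exit update %red):
//   %sel = select <VF x i1> %header.mask, <VF x i32> %red, <VF x i32> %phi
// so lanes disabled by the folded tail carry the phi's previous value forward.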
7102 if (CM.foldTailByMasking()) { 7103 Builder.setInsertPoint(VPBB); 7104 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7105 for (auto &Reduction : *Legal->getReductionVars()) { 7106 VPValue *Phi = Plan->getVPValue(Reduction.first); 7107 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7108 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7109 } 7110 } 7111 7112 std::string PlanName; 7113 raw_string_ostream RSO(PlanName); 7114 unsigned VF = Range.Start; 7115 Plan->addVF(VF); 7116 RSO << "Initial VPlan for VF={" << VF; 7117 for (VF *= 2; VF < Range.End; VF *= 2) { 7118 Plan->addVF(VF); 7119 RSO << "," << VF; 7120 } 7121 RSO << "},UF>=1"; 7122 RSO.flush(); 7123 Plan->setName(PlanName); 7124 7125 return Plan; 7126 } 7127 7128 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7129 // Outer loop handling: They may require CFG and instruction level 7130 // transformations before even evaluating whether vectorization is profitable. 7131 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7132 // the vectorization pipeline. 7133 assert(!OrigLoop->empty()); 7134 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7135 7136 // Create new empty VPlan 7137 auto Plan = std::make_unique<VPlan>(); 7138 7139 // Build hierarchical CFG 7140 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7141 HCFGBuilder.buildHierarchicalCFG(); 7142 7143 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7144 Plan->addVF(VF); 7145 7146 if (EnableVPlanPredication) { 7147 VPlanPredicator VPP(*Plan); 7148 VPP.predicate(); 7149 7150 // Avoid running transformation to recipes until masked code generation in 7151 // VPlan-native path is in place. 7152 return Plan; 7153 } 7154 7155 SmallPtrSet<Instruction *, 1> DeadInstructions; 7156 VPlanHCFGTransforms::VPInstructionsToVPRecipes( 7157 Plan, Legal->getInductionVars(), DeadInstructions); 7158 7159 return Plan; 7160 } 7161 7162 Value* LoopVectorizationPlanner::VPCallbackILV:: 7163 getOrCreateVectorValues(Value *V, unsigned Part) { 7164 return ILV.getOrCreateVectorValue(V, Part); 7165 } 7166 7167 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { 7168 O << " +\n" 7169 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7170 IG->getInsertPos()->printAsOperand(O, false); 7171 if (User) { 7172 O << ", "; 7173 User->getOperand(0)->printAsOperand(O); 7174 } 7175 O << "\\l\""; 7176 for (unsigned i = 0; i < IG->getFactor(); ++i) 7177 if (Instruction *I = IG->getMember(i)) 7178 O << " +\n" 7179 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7180 } 7181 7182 void VPWidenRecipe::execute(VPTransformState &State) { 7183 for (auto &Instr : make_range(Begin, End)) 7184 State.ILV->widenInstruction(Instr); 7185 } 7186 7187 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7188 assert(!State.Instance && "Int or FP induction being replicated."); 7189 State.ILV->widenIntOrFpInduction(IV, Trunc); 7190 } 7191 7192 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7193 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7194 } 7195 7196 void VPBlendRecipe::execute(VPTransformState &State) { 7197 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7198 // We know that all PHIs in non-header blocks are converted into 7199 // selects, so we don't have to worry about the insertion order and we 7200 // can just use the builder. 7201 // At this point we generate the predication tree. 
There may be 7202 // duplications since this is a simple recursive scan, but future 7203 // optimizations will clean it up. 7204 7205 unsigned NumIncoming = Phi->getNumIncomingValues(); 7206 7207 assert((User || NumIncoming == 1) && 7208 "Multiple predecessors with predecessors having a full mask"); 7209 // Generate a sequence of selects of the form: 7210 // SELECT(Mask3, In3, 7211 // SELECT(Mask2, In2, 7212 // ( ...))) 7213 InnerLoopVectorizer::VectorParts Entry(State.UF); 7214 for (unsigned In = 0; In < NumIncoming; ++In) { 7215 for (unsigned Part = 0; Part < State.UF; ++Part) { 7216 // We might have single edge PHIs (blocks) - use an identity 7217 // 'select' for the first PHI operand. 7218 Value *In0 = 7219 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7220 if (In == 0) 7221 Entry[Part] = In0; // Initialize with the first incoming value. 7222 else { 7223 // Select between the current value and the previous incoming edge 7224 // based on the incoming mask. 7225 Value *Cond = State.get(User->getOperand(In), Part); 7226 Entry[Part] = 7227 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7228 } 7229 } 7230 } 7231 for (unsigned Part = 0; Part < State.UF; ++Part) 7232 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7233 } 7234 7235 void VPInterleaveRecipe::execute(VPTransformState &State) { 7236 assert(!State.Instance && "Interleave group being replicated."); 7237 if (!User) 7238 return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); 7239 7240 // Last (and currently only) operand is a mask. 7241 InnerLoopVectorizer::VectorParts MaskValues(State.UF); 7242 VPValue *Mask = User->getOperand(User->getNumOperands() - 1); 7243 for (unsigned Part = 0; Part < State.UF; ++Part) 7244 MaskValues[Part] = State.get(Mask, Part); 7245 State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); 7246 } 7247 7248 void VPReplicateRecipe::execute(VPTransformState &State) { 7249 if (State.Instance) { // Generate a single instance. 7250 State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); 7251 // Insert scalar instance packing it into a vector. 7252 if (AlsoPack && State.VF > 1) { 7253 // If we're constructing lane 0, initialize to start from undef. 7254 if (State.Instance->Lane == 0) { 7255 Value *Undef = 7256 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); 7257 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); 7258 } 7259 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); 7260 } 7261 return; 7262 } 7263 7264 // Generate scalar instances for all VF lanes of all UF parts, unless the 7265 // instruction is uniform inwhich case generate only the first lane for each 7266 // of the UF parts. 7267 unsigned EndLane = IsUniform ? 1 : State.VF; 7268 for (unsigned Part = 0; Part < State.UF; ++Part) 7269 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 7270 State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); 7271 } 7272 7273 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 7274 assert(State.Instance && "Branch on Mask works only on single instance."); 7275 7276 unsigned Part = State.Instance->Part; 7277 unsigned Lane = State.Instance->Lane; 7278 7279 Value *ConditionBit = nullptr; 7280 if (!User) // Block in mask is all-one. 
7281 ConditionBit = State.Builder.getTrue(); 7282 else { 7283 VPValue *BlockInMask = User->getOperand(0); 7284 ConditionBit = State.get(BlockInMask, Part); 7285 if (ConditionBit->getType()->isVectorTy()) 7286 ConditionBit = State.Builder.CreateExtractElement( 7287 ConditionBit, State.Builder.getInt32(Lane)); 7288 } 7289 7290 // Replace the temporary unreachable terminator with a new conditional branch, 7291 // whose two destinations will be set later when they are created. 7292 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7293 assert(isa<UnreachableInst>(CurrentTerminator) && 7294 "Expected to replace unreachable terminator with conditional branch."); 7295 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7296 CondBr->setSuccessor(0, nullptr); 7297 ReplaceInstWithInst(CurrentTerminator, CondBr); 7298 } 7299 7300 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7301 assert(State.Instance && "Predicated instruction PHI works per instance."); 7302 Instruction *ScalarPredInst = cast<Instruction>( 7303 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7304 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7305 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7306 assert(PredicatingBB && "Predicated block has no single predecessor."); 7307 7308 // By current pack/unpack logic we need to generate only a single phi node: if 7309 // a vector value for the predicated instruction exists at this point it means 7310 // the instruction has vector users only, and a phi for the vector value is 7311 // needed. In this case the recipe of the predicated instruction is marked to 7312 // also do that packing, thereby "hoisting" the insert-element sequence. 7313 // Otherwise, a phi node for the scalar value is needed. 7314 unsigned Part = State.Instance->Part; 7315 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7316 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7317 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7318 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7319 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7320 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7321 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7322 } else { 7323 Type *PredInstType = PredInst->getType(); 7324 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7325 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7326 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7327 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7328 } 7329 } 7330 7331 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7332 if (!User) 7333 return State.ILV->vectorizeMemoryInstruction(&Instr); 7334 7335 // Last (and currently only) operand is a mask. 
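// This mirrors VPInterleaveRecipe::execute above: the optional mask is the
// trailing operand, and one vector mask value is fetched per unroll part
// before delegating to the widened memory instruction.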
7336 InnerLoopVectorizer::VectorParts MaskValues(State.UF); 7337 VPValue *Mask = User->getOperand(User->getNumOperands() - 1); 7338 for (unsigned Part = 0; Part < State.UF; ++Part) 7339 MaskValues[Part] = State.get(Mask, Part); 7340 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); 7341 } 7342 7343 static ScalarEpilogueLowering 7344 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, 7345 ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { 7346 ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; 7347 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && 7348 (F->hasOptSize() || 7349 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) 7350 SEL = CM_ScalarEpilogueNotAllowedOptSize; 7351 else if (PreferPredicateOverEpilog || Hints.getPredicate()) 7352 SEL = CM_ScalarEpilogueNotNeededUsePredicate; 7353 7354 return SEL; 7355 } 7356 7357 // Process the loop in the VPlan-native vectorization path. This path builds 7358 // VPlan upfront in the vectorization pipeline, which allows to apply 7359 // VPlan-to-VPlan transformations from the very beginning without modifying the 7360 // input LLVM IR. 7361 static bool processLoopInVPlanNativePath( 7362 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7363 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7364 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7365 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7366 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7367 7368 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7369 Function *F = L->getHeader()->getParent(); 7370 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7371 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); 7372 7373 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7374 &Hints, IAI); 7375 // Use the planner for outer loop vectorization. 7376 // TODO: CM is not used at this point inside the planner. Turn CM into an 7377 // optional argument if we don't need it in the future. 7378 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); 7379 7380 // Get user vectorization factor. 7381 const unsigned UserVF = Hints.getWidth(); 7382 7383 // Plan how to best vectorize, return the best VF and its cost. 7384 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7385 7386 // If we are stress testing VPlan builds, do not attempt to generate vector 7387 // code. Masked vector code generation support will follow soon. 7388 // Also, do not attempt to vectorize if no vector code will be produced. 7389 if (VPlanBuildStressTest || EnableVPlanPredication || 7390 VectorizationFactor::Disabled() == VF) 7391 return false; 7392 7393 LVP.setBestPlan(VF.Width, 1); 7394 7395 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7396 &CM); 7397 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7398 << L->getHeader()->getParent()->getName() << "\"\n"); 7399 LVP.executePlan(LB, DT); 7400 7401 // Mark the loop as already vectorized to avoid vectorizing again. 7402 Hints.setAlreadyVectorized(); 7403 7404 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7405 return true; 7406 } 7407 7408 bool LoopVectorizePass::processLoop(Loop *L) { 7409 assert((EnableVPlanNativePath || L->empty()) && 7410 "VPlan-native path is not enabled. 
Only process inner loops."); 7411 7412 #ifndef NDEBUG 7413 const std::string DebugLocStr = getDebugLocString(L); 7414 #endif /* NDEBUG */ 7415 7416 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7417 << L->getHeader()->getParent()->getName() << "\" from " 7418 << DebugLocStr << "\n"); 7419 7420 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7421 7422 LLVM_DEBUG( 7423 dbgs() << "LV: Loop hints:" 7424 << " force=" 7425 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7426 ? "disabled" 7427 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7428 ? "enabled" 7429 : "?")) 7430 << " width=" << Hints.getWidth() 7431 << " unroll=" << Hints.getInterleave() << "\n"); 7432 7433 // Function containing loop 7434 Function *F = L->getHeader()->getParent(); 7435 7436 // Looking at the diagnostic output is the only way to determine if a loop 7437 // was vectorized (other than looking at the IR or machine code), so it 7438 // is important to generate an optimization remark for each loop. Most of 7439 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7440 // generated as OptimizationRemark and OptimizationRemarkMissed are 7441 // less verbose reporting vectorized loops and unvectorized loops that may 7442 // benefit from vectorization, respectively. 7443 7444 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7445 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7446 return false; 7447 } 7448 7449 PredicatedScalarEvolution PSE(*SE, *L); 7450 7451 // Check if it is legal to vectorize the loop. 7452 LoopVectorizationRequirements Requirements(*ORE); 7453 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7454 &Requirements, &Hints, DB, AC); 7455 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7456 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7457 Hints.emitRemarkWithHints(); 7458 return false; 7459 } 7460 7461 // Check the function attributes and profiles to find out if this function 7462 // should be optimized for size. 7463 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); 7464 7465 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7466 // here. They may require CFG and instruction level transformations before 7467 // even evaluating whether vectorization is profitable. Since we cannot modify 7468 // the incoming IR, we need to build VPlan upfront in the vectorization 7469 // pipeline. 7470 if (!L->empty()) 7471 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7472 ORE, BFI, PSI, Hints); 7473 7474 assert(L->empty() && "Inner loop expected."); 7475 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7476 // count by optimizing for size, to minimize overheads. 7477 // Prefer constant trip counts over profile data, over upper bound estimate. 7478 unsigned ExpectedTC = 0; 7479 bool HasExpectedTC = false; 7480 if (const SCEVConstant *ConstExits = 7481 dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) { 7482 const APInt &ExitsCount = ConstExits->getAPInt(); 7483 // We are interested in small values for ExpectedTC. Skip over those that 7484 // can't fit an unsigned. 7485 if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) { 7486 ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1; 7487 HasExpectedTC = true; 7488 } 7489 } 7490 // ExpectedTC may be large because it's bound by a variable. Check 7491 // profiling information to validate we should vectorize. 
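// (The checks below fall back from the constant backedge-taken count above
// to a profile-based estimate, and finally to SCEV's small constant maximum
// trip count, before comparing against TinyTripCountVectorThreshold.)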
7492 if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { 7493 auto EstimatedTC = getLoopEstimatedTripCount(L); 7494 if (EstimatedTC) { 7495 ExpectedTC = *EstimatedTC; 7496 HasExpectedTC = true; 7497 } 7498 } 7499 if (!HasExpectedTC) { 7500 ExpectedTC = SE->getSmallConstantMaxTripCount(L); 7501 HasExpectedTC = (ExpectedTC > 0); 7502 } 7503 7504 if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { 7505 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7506 << "This loop is worth vectorizing only if no scalar " 7507 << "iteration overheads are incurred."); 7508 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7509 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7510 else { 7511 LLVM_DEBUG(dbgs() << "\n"); 7512 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7513 } 7514 } 7515 7516 // Check the function attributes to see if implicit floats are allowed. 7517 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7518 // an integer loop and the vector instructions selected are purely integer 7519 // vector instructions? 7520 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7521 reportVectorizationFailure( 7522 "Can't vectorize when the NoImplicitFloat attribute is used", 7523 "loop not vectorized due to NoImplicitFloat attribute", 7524 "NoImplicitFloat", ORE, L); 7525 Hints.emitRemarkWithHints(); 7526 return false; 7527 } 7528 7529 // Check if the target supports potentially unsafe FP vectorization. 7530 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7531 // for the target we're vectorizing for, to make sure none of the 7532 // additional fp-math flags can help. 7533 if (Hints.isPotentiallyUnsafe() && 7534 TTI->isFPVectorizationPotentiallyUnsafe()) { 7535 reportVectorizationFailure( 7536 "Potentially unsafe FP op prevents vectorization", 7537 "loop not vectorized due to unsafe FP support.", 7538 "UnsafeFP", ORE, L); 7539 Hints.emitRemarkWithHints(); 7540 return false; 7541 } 7542 7543 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7544 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7545 7546 // If an override option has been passed in for interleaved accesses, use it. 7547 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7548 UseInterleaved = EnableInterleavedMemAccesses; 7549 7550 // Analyze interleaved memory accesses. 7551 if (UseInterleaved) { 7552 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7553 } 7554 7555 // Use the cost model. 7556 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7557 F, &Hints, IAI); 7558 CM.collectValuesToIgnore(); 7559 7560 // Use the planner for vectorization. 7561 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); 7562 7563 // Get user vectorization factor. 7564 unsigned UserVF = Hints.getWidth(); 7565 7566 // Plan how to best vectorize, return the best VF and its cost. 7567 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7568 7569 VectorizationFactor VF = VectorizationFactor::Disabled(); 7570 unsigned IC = 1; 7571 unsigned UserIC = Hints.getInterleave(); 7572 7573 if (MaybeVF) { 7574 VF = *MaybeVF; 7575 // Select the interleave count. 7576 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7577 } 7578 7579 // Identify the diagnostic messages that should be produced. 
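// Vectorization and interleaving are decided and reported independently:
// VecDiagMsg covers the vectorization decision, IntDiagMsg the interleaving
// decision, and either one may be vetoed by the cost model or by user hints.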
7580   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7581   bool VectorizeLoop = true, InterleaveLoop = true;
7582   if (Requirements.doesNotMeet(F, L, Hints)) {
7583     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7584                          "requirements.\n");
7585     Hints.emitRemarkWithHints();
7586     return false;
7587   }
7588
7589   if (VF.Width == 1) {
7590     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7591     VecDiagMsg = std::make_pair(
7592         "VectorizationNotBeneficial",
7593         "the cost-model indicates that vectorization is not beneficial");
7594     VectorizeLoop = false;
7595   }
7596
7597   if (!MaybeVF && UserIC > 1) {
7598     // Tell the user interleaving was avoided up-front, despite being explicitly
7599     // requested.
7600     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7601                          "interleaving should be avoided up front\n");
7602     IntDiagMsg = std::make_pair(
7603         "InterleavingAvoided",
7604         "Ignoring UserIC, because interleaving was avoided up front");
7605     InterleaveLoop = false;
7606   } else if (IC == 1 && UserIC <= 1) {
7607     // Tell the user interleaving is not beneficial.
7608     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7609     IntDiagMsg = std::make_pair(
7610         "InterleavingNotBeneficial",
7611         "the cost-model indicates that interleaving is not beneficial");
7612     InterleaveLoop = false;
7613     if (UserIC == 1) {
7614       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7615       IntDiagMsg.second +=
7616           " and is explicitly disabled or interleave count is set to 1";
7617     }
7618   } else if (IC > 1 && UserIC == 1) {
7619     // Tell the user interleaving is beneficial, but it is explicitly disabled.
7620     LLVM_DEBUG(
7621         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7622     IntDiagMsg = std::make_pair(
7623         "InterleavingBeneficialButDisabled",
7624         "the cost-model indicates that interleaving is beneficial "
7625         "but is explicitly disabled or interleave count is set to 1");
7626     InterleaveLoop = false;
7627   }
7628
7629   // Override IC if user provided an interleave count.
7630   IC = UserIC > 0 ? UserIC : IC;
7631
7632   // Emit diagnostic messages, if any.
7633   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7634   if (!VectorizeLoop && !InterleaveLoop) {
7635     // Do not vectorize or interleave the loop.
7636 ORE->emit([&]() { 7637 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7638 L->getStartLoc(), L->getHeader()) 7639 << VecDiagMsg.second; 7640 }); 7641 ORE->emit([&]() { 7642 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7643 L->getStartLoc(), L->getHeader()) 7644 << IntDiagMsg.second; 7645 }); 7646 return false; 7647 } else if (!VectorizeLoop && InterleaveLoop) { 7648 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7649 ORE->emit([&]() { 7650 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7651 L->getStartLoc(), L->getHeader()) 7652 << VecDiagMsg.second; 7653 }); 7654 } else if (VectorizeLoop && !InterleaveLoop) { 7655 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7656 << ") in " << DebugLocStr << '\n'); 7657 ORE->emit([&]() { 7658 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7659 L->getStartLoc(), L->getHeader()) 7660 << IntDiagMsg.second; 7661 }); 7662 } else if (VectorizeLoop && InterleaveLoop) { 7663 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7664 << ") in " << DebugLocStr << '\n'); 7665 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7666 } 7667 7668 LVP.setBestPlan(VF.Width, IC); 7669 7670 using namespace ore; 7671 bool DisableRuntimeUnroll = false; 7672 MDNode *OrigLoopID = L->getLoopID(); 7673 7674 if (!VectorizeLoop) { 7675 assert(IC > 1 && "interleave count should not be 1 or 0"); 7676 // If we decided that it is not legal to vectorize the loop, then 7677 // interleave it. 7678 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7679 &CM); 7680 LVP.executePlan(Unroller, DT); 7681 7682 ORE->emit([&]() { 7683 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7684 L->getHeader()) 7685 << "interleaved loop (interleaved count: " 7686 << NV("InterleaveCount", IC) << ")"; 7687 }); 7688 } else { 7689 // If we decided that it is *legal* to vectorize the loop, then do it. 7690 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7691 &LVL, &CM); 7692 LVP.executePlan(LB, DT); 7693 ++LoopsVectorized; 7694 7695 // Add metadata to disable runtime unrolling a scalar loop when there are 7696 // no runtime checks about strides and memory. A scalar loop that is 7697 // rarely used is not worth unrolling. 7698 if (!LB.areSafetyChecksAdded()) 7699 DisableRuntimeUnroll = true; 7700 7701 // Report the vectorization decision. 7702 ORE->emit([&]() { 7703 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7704 L->getHeader()) 7705 << "vectorized loop (vectorization width: " 7706 << NV("VectorizationFactor", VF.Width) 7707 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7708 }); 7709 } 7710 7711 Optional<MDNode *> RemainderLoopID = 7712 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7713 LLVMLoopVectorizeFollowupEpilogue}); 7714 if (RemainderLoopID.hasValue()) { 7715 L->setLoopID(RemainderLoopID.getValue()); 7716 } else { 7717 if (DisableRuntimeUnroll) 7718 AddRuntimeUnrollDisableMetaData(L); 7719 7720 // Mark the loop as already vectorized to avoid vectorizing again. 
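// (Recorded as loop metadata, typically an "llvm.loop.isvectorized" hint,
// so later runs of the vectorizer skip the transformed loop; the exact
// encoding is an implementation detail of LoopVectorizeHints.)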
7721 Hints.setAlreadyVectorized(); 7722 } 7723 7724 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7725 return true; 7726 } 7727 7728 bool LoopVectorizePass::runImpl( 7729 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 7730 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 7731 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, 7732 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 7733 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 7734 SE = &SE_; 7735 LI = &LI_; 7736 TTI = &TTI_; 7737 DT = &DT_; 7738 BFI = &BFI_; 7739 TLI = TLI_; 7740 AA = &AA_; 7741 AC = &AC_; 7742 GetLAA = &GetLAA_; 7743 DB = &DB_; 7744 ORE = &ORE_; 7745 PSI = PSI_; 7746 7747 // Don't attempt if 7748 // 1. the target claims to have no vector registers, and 7749 // 2. interleaving won't help ILP. 7750 // 7751 // The second condition is necessary because, even if the target has no 7752 // vector registers, loop vectorization may still enable scalar 7753 // interleaving. 7754 if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) 7755 return false; 7756 7757 bool Changed = false; 7758 7759 // The vectorizer requires loops to be in simplified form. 7760 // Since simplification may add new inner loops, it has to run before the 7761 // legality and profitability checks. This means running the loop vectorizer 7762 // will simplify all loops, regardless of whether anything end up being 7763 // vectorized. 7764 for (auto &L : *LI) 7765 Changed |= 7766 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 7767 7768 // Build up a worklist of inner-loops to vectorize. This is necessary as 7769 // the act of vectorizing or partially unrolling a loop creates new loops 7770 // and can invalidate iterators across the loops. 7771 SmallVector<Loop *, 8> Worklist; 7772 7773 for (Loop *L : *LI) 7774 collectSupportedLoops(*L, LI, ORE, Worklist); 7775 7776 LoopsAnalyzed += Worklist.size(); 7777 7778 // Now walk the identified inner loops. 7779 while (!Worklist.empty()) { 7780 Loop *L = Worklist.pop_back_val(); 7781 7782 // For the inner loops we actually process, form LCSSA to simplify the 7783 // transform. 7784 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 7785 7786 Changed |= processLoop(L); 7787 } 7788 7789 // Process each loop nest in the function. 7790 return Changed; 7791 } 7792 7793 PreservedAnalyses LoopVectorizePass::run(Function &F, 7794 FunctionAnalysisManager &AM) { 7795 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 7796 auto &LI = AM.getResult<LoopAnalysis>(F); 7797 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 7798 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 7799 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 7800 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 7801 auto &AA = AM.getResult<AAManager>(F); 7802 auto &AC = AM.getResult<AssumptionAnalysis>(F); 7803 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 7804 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 7805 MemorySSA *MSSA = EnableMSSALoopDependency 7806 ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 7807 : nullptr; 7808 7809 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 7810 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 7811 [&](Loop &L) -> const LoopAccessInfo & { 7812 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 7813 return LAM.getResult<LoopAccessAnalysis>(L, AR); 7814 }; 7815 const ModuleAnalysisManager &MAM = 7816 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 7817 ProfileSummaryInfo *PSI = 7818 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 7819 bool Changed = 7820 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 7821 if (!Changed) 7822 return PreservedAnalyses::all(); 7823 PreservedAnalyses PA; 7824 7825 // We currently do not preserve loopinfo/dominator analyses with outer loop 7826 // vectorization. Until this is addressed, mark these analyses as preserved 7827 // only for non-VPlan-native path. 7828 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 7829 if (!EnableVPlanNativePath) { 7830 PA.preserve<LoopAnalysis>(); 7831 PA.preserve<DominatorTreeAnalysis>(); 7832 } 7833 PA.preserve<BasicAA>(); 7834 PA.preserve<GlobalsAA>(); 7835 return PA; 7836 } 7837