//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is an ongoing development effort to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
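//
// As a purely conceptual illustration of the transformation described above
// (not code from this pass): with a vectorization factor of 4, a loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + C[i];
//
// is emitted so that each vector iteration loads B[i..i+3] and C[i..i+3],
// performs a single wide add, stores A[i..i+3], and then advances the
// induction variable by 4; a scalar epilogue (or a predicated tail) handles
// the remaining n % 4 iterations.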
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanHCFGTransforms.h"
#include "VPlanPredicator.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and that predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
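// For illustration only (not asserted anywhere in this file, and assuming the
// default x86-64 DataLayout): x86_fp80 has a store size of 10 bytes but an
// alloc size of 16 bytes, so hasIrregularType() reports it as irregular even
// at VF == 1. Likewise, i1 is irregular for VF > 1, because a <VF x i1>
// vector packs its elements into bits while an array of i1 allocates a full
// byte per element.
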
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

/// A helper function that applies the fast-math flags \p FMF to a
/// floating-point operation.
static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Set up cost-based decisions for the user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on its cost. This
  /// function takes cost-based decisions for Load/Store instructions and
  /// collects them in a map. This decision map is used for building the lists
  /// of loop-uniform and loop-scalar instructions. The calculated cost is
  /// saved with the widening decision in order to avoid redundant
  /// calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;

    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
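    /// (The group's cost is attached to the insert-position member only, and
    /// the remaining members are recorded with a cost of 0, so that summing
    /// per-instruction costs counts the whole group exactly once.)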
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the loop tail.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either the vector version isn't available, or it is
  /// too expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
0 : extract of last
1323 /// element)
1324 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1325
1326 /// Estimate the overhead of scalarizing an instruction. This is a
1327 /// convenience wrapper for the type-based getScalarizationOverhead API.
1328 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1329
1330 /// Returns whether the instruction is a load or store and will be emitted
1331 /// as a vector operation.
1332 bool isConsecutiveLoadOrStore(Instruction *I);
1333
1334 /// Returns true if an artificially high cost for emulated masked memrefs
1335 /// should be used.
1336 bool useEmulatedMaskMemRefHack(Instruction *I);
1337
1338 /// Map of scalar integer values to the smallest bitwidth they can be legally
1339 /// represented as. The vector equivalents of these values should be truncated
1340 /// to this type.
1341 MapVector<Instruction *, uint64_t> MinBWs;
1342
1343 /// A type representing the costs for instructions if they were to be
1344 /// scalarized rather than vectorized. The entries are Instruction-Cost
1345 /// pairs.
1346 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1347
1348 /// A set containing all BasicBlocks that are known to be present after
1349 /// vectorization as predicated blocks.
1350 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1351
1352 /// Records whether it is allowed to have the original scalar loop execute at
1353 /// least once. This may be needed as a fallback loop in case runtime
1354 /// aliasing/dependence checks fail, or to handle the tail/remainder
1355 /// iterations when the trip count is unknown or doesn't divide by the VF,
1356 /// or as a peel-loop to handle gaps in interleave-groups.
1357 /// Under optsize and when the trip count is very small we don't allow any
1358 /// iterations to execute in the scalar loop.
1359 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1360
1361 /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1362 bool FoldTailByMasking = false;
1363
1364 /// A map holding scalar costs for different vectorization factors. The
1365 /// presence of a cost for an instruction in the mapping indicates that the
1366 /// instruction will be scalarized when vectorizing with the associated
1367 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1368 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1369
1370 /// Holds the instructions known to be uniform after vectorization.
1371 /// The data is collected per VF.
1372 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1373
1374 /// Holds the instructions known to be scalar after vectorization.
1375 /// The data is collected per VF.
1376 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1377
1378 /// Holds the instructions (address computations) that are forced to be
1379 /// scalarized.
1380 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1381
1382 /// Returns the expected difference in cost from scalarizing the expression
1383 /// feeding a predicated instruction \p PredInst. The instructions to
1384 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1385 /// non-negative return value implies the expression will be scalarized.
1386 /// Currently, only single-use chains are considered for scalarization.
1387 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1388 unsigned VF);
1389
1390 /// Collect the instructions that are uniform after vectorization.
An 1391 /// instruction is uniform if we represent it with a single scalar value in 1392 /// the vectorized loop corresponding to each vector iteration. Examples of 1393 /// uniform instructions include pointer operands of consecutive or 1394 /// interleaved memory accesses. Note that although uniformity implies an 1395 /// instruction will be scalar, the reverse is not true. In general, a 1396 /// scalarized instruction will be represented by VF scalar values in the 1397 /// vectorized loop, each corresponding to an iteration of the original 1398 /// scalar loop. 1399 void collectLoopUniforms(unsigned VF); 1400 1401 /// Collect the instructions that are scalar after vectorization. An 1402 /// instruction is scalar if it is known to be uniform or will be scalarized 1403 /// during vectorization. Non-uniform scalarized instructions will be 1404 /// represented by VF values in the vectorized loop, each corresponding to an 1405 /// iteration of the original scalar loop. 1406 void collectLoopScalars(unsigned VF); 1407 1408 /// Keeps cost model vectorization decision and cost for instructions. 1409 /// Right now it is used for memory instructions only. 1410 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1411 std::pair<InstWidening, unsigned>>; 1412 1413 DecisionList WideningDecisions; 1414 1415 /// Returns true if \p V is expected to be vectorized and it needs to be 1416 /// extracted. 1417 bool needsExtract(Value *V, unsigned VF) const { 1418 Instruction *I = dyn_cast<Instruction>(V); 1419 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1420 return false; 1421 1422 // Assume we can vectorize V (and hence we need extraction) if the 1423 // scalars are not computed yet. This can happen, because it is called 1424 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1425 // the scalars are collected. That should be a safe assumption in most 1426 // cases, because we check if the operands have vectorizable types 1427 // beforehand in LoopVectorizationLegality. 1428 return Scalars.find(VF) == Scalars.end() || 1429 !isScalarAfterVectorization(I, VF); 1430 }; 1431 1432 /// Returns a range containing only operands needing to be extracted. 1433 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1434 unsigned VF) { 1435 return SmallVector<Value *, 4>(make_filter_range( 1436 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1437 } 1438 1439 public: 1440 /// The loop that we evaluate. 1441 Loop *TheLoop; 1442 1443 /// Predicated scalar evolution analysis. 1444 PredicatedScalarEvolution &PSE; 1445 1446 /// Loop Info analysis. 1447 LoopInfo *LI; 1448 1449 /// Vectorization legality. 1450 LoopVectorizationLegality *Legal; 1451 1452 /// Vector target information. 1453 const TargetTransformInfo &TTI; 1454 1455 /// Target Library Info. 1456 const TargetLibraryInfo *TLI; 1457 1458 /// Demanded bits analysis. 1459 DemandedBits *DB; 1460 1461 /// Assumption cache. 1462 AssumptionCache *AC; 1463 1464 /// Interface to emit optimization remarks. 1465 OptimizationRemarkEmitter *ORE; 1466 1467 const Function *TheFunction; 1468 1469 /// Loop Vectorize Hint. 1470 const LoopVectorizeHints *Hints; 1471 1472 /// The interleave access information contains groups of interleaved accesses 1473 /// with the same stride and close to each other. 1474 InterleavedAccessInfo &InterleaveInfo; 1475 1476 /// Values to ignore in the cost model. 
1477 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1478 1479 /// Values to ignore in the cost model when VF > 1. 1480 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1481 }; 1482 1483 } // end namespace llvm 1484 1485 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1486 // vectorization. The loop needs to be annotated with #pragma omp simd 1487 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1488 // vector length information is not provided, vectorization is not considered 1489 // explicit. Interleave hints are not allowed either. These limitations will be 1490 // relaxed in the future. 1491 // Please, note that we are currently forced to abuse the pragma 'clang 1492 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1493 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1494 // provides *explicit vectorization hints* (LV can bypass legal checks and 1495 // assume that vectorization is legal). However, both hints are implemented 1496 // using the same metadata (llvm.loop.vectorize, processed by 1497 // LoopVectorizeHints). This will be fixed in the future when the native IR 1498 // representation for pragma 'omp simd' is introduced. 1499 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1500 OptimizationRemarkEmitter *ORE) { 1501 assert(!OuterLp->empty() && "This is not an outer loop"); 1502 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1503 1504 // Only outer loops with an explicit vectorization hint are supported. 1505 // Unannotated outer loops are ignored. 1506 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1507 return false; 1508 1509 Function *Fn = OuterLp->getHeader()->getParent(); 1510 if (!Hints.allowVectorization(Fn, OuterLp, 1511 true /*VectorizeOnlyWhenForced*/)) { 1512 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1513 return false; 1514 } 1515 1516 if (Hints.getInterleave() > 1) { 1517 // TODO: Interleave support is future work. 1518 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1519 "outer loops.\n"); 1520 Hints.emitRemarkWithHints(); 1521 return false; 1522 } 1523 1524 return true; 1525 } 1526 1527 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1528 OptimizationRemarkEmitter *ORE, 1529 SmallVectorImpl<Loop *> &V) { 1530 // Collect inner loops and outer loops without irreducible control flow. For 1531 // now, only collect outer loops that have explicit vectorization hints. If we 1532 // are stress testing the VPlan H-CFG construction, we collect the outermost 1533 // loop of every loop nest. 1534 if (L.empty() || VPlanBuildStressTest || 1535 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1536 LoopBlocksRPO RPOT(&L); 1537 RPOT.perform(LI); 1538 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1539 V.push_back(&L); 1540 // TODO: Collect inner loops inside marked outer loops in case 1541 // vectorization fails for the outer loop. Do not invoke 1542 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1543 // already known to be reducible. We can use an inherited attribute for 1544 // that. 1545 return; 1546 } 1547 } 1548 for (Loop *InnerL : L) 1549 collectSupportedLoops(*InnerL, LI, ORE, V); 1550 } 1551 1552 namespace { 1553 1554 /// The LoopVectorize Pass. 
1555 struct LoopVectorize : public FunctionPass { 1556 /// Pass identification, replacement for typeid 1557 static char ID; 1558 1559 LoopVectorizePass Impl; 1560 1561 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1562 bool VectorizeOnlyWhenForced = false) 1563 : FunctionPass(ID) { 1564 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1565 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1566 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1567 } 1568 1569 bool runOnFunction(Function &F) override { 1570 if (skipFunction(F)) 1571 return false; 1572 1573 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1574 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1575 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1576 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1577 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1578 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1579 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1580 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1581 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1582 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1583 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1584 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1585 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1586 1587 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1588 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1589 1590 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1591 GetLAA, *ORE, PSI); 1592 } 1593 1594 void getAnalysisUsage(AnalysisUsage &AU) const override { 1595 AU.addRequired<AssumptionCacheTracker>(); 1596 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1597 AU.addRequired<DominatorTreeWrapperPass>(); 1598 AU.addRequired<LoopInfoWrapperPass>(); 1599 AU.addRequired<ScalarEvolutionWrapperPass>(); 1600 AU.addRequired<TargetTransformInfoWrapperPass>(); 1601 AU.addRequired<AAResultsWrapperPass>(); 1602 AU.addRequired<LoopAccessLegacyAnalysis>(); 1603 AU.addRequired<DemandedBitsWrapperPass>(); 1604 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1605 1606 // We currently do not preserve loopinfo/dominator analyses with outer loop 1607 // vectorization. Until this is addressed, mark these analyses as preserved 1608 // only for non-VPlan-native path. 1609 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1610 if (!EnableVPlanNativePath) { 1611 AU.addPreserved<LoopInfoWrapperPass>(); 1612 AU.addPreserved<DominatorTreeWrapperPass>(); 1613 } 1614 1615 AU.addPreserved<BasicAAWrapperPass>(); 1616 AU.addPreserved<GlobalsAAWrapperPass>(); 1617 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1618 } 1619 }; 1620 1621 } // end anonymous namespace 1622 1623 //===----------------------------------------------------------------------===// 1624 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1625 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1626 //===----------------------------------------------------------------------===// 1627 1628 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1629 // We need to place the broadcast of invariant variables outside the loop, 1630 // but only if it's proven safe to do so. Else, broadcast will be inside 1631 // vector loop body. 
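  // For illustration only (assuming VF = 4 and an i32 scalar %x): the
  // CreateVectorSplat call below typically expands to an insertelement into
  // lane 0 followed by an all-zeros shufflevector mask, e.g.
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> undef, <4 x i32> zeroinitializer
  // When SafeToHoist holds, this sequence is emitted in the vector preheader
  // so the splat is computed once instead of on every vector iteration.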
1632 Instruction *Instr = dyn_cast<Instruction>(V); 1633 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1634 (!Instr || 1635 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1636 // Place the code for broadcasting invariant variables in the new preheader. 1637 IRBuilder<>::InsertPointGuard Guard(Builder); 1638 if (SafeToHoist) 1639 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1640 1641 // Broadcast the scalar into all locations in the vector. 1642 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1643 1644 return Shuf; 1645 } 1646 1647 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1648 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1649 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1650 "Expected either an induction phi-node or a truncate of it!"); 1651 Value *Start = II.getStartValue(); 1652 1653 // Construct the initial value of the vector IV in the vector loop preheader 1654 auto CurrIP = Builder.saveIP(); 1655 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1656 if (isa<TruncInst>(EntryVal)) { 1657 assert(Start->getType()->isIntegerTy() && 1658 "Truncation requires an integer type"); 1659 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1660 Step = Builder.CreateTrunc(Step, TruncType); 1661 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1662 } 1663 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1664 Value *SteppedStart = 1665 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1666 1667 // We create vector phi nodes for both integer and floating-point induction 1668 // variables. Here, we determine the kind of arithmetic we will perform. 1669 Instruction::BinaryOps AddOp; 1670 Instruction::BinaryOps MulOp; 1671 if (Step->getType()->isIntegerTy()) { 1672 AddOp = Instruction::Add; 1673 MulOp = Instruction::Mul; 1674 } else { 1675 AddOp = II.getInductionOpcode(); 1676 MulOp = Instruction::FMul; 1677 } 1678 1679 // Multiply the vectorization factor by the step using integer or 1680 // floating-point arithmetic as appropriate. 1681 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1682 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1683 1684 // Create a vector splat to use in the induction update. 1685 // 1686 // FIXME: If the step is non-constant, we create the vector splat with 1687 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1688 // handle a constant vector splat. 1689 Value *SplatVF = isa<Constant>(Mul) 1690 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1691 : Builder.CreateVectorSplat(VF, Mul); 1692 Builder.restoreIP(CurrIP); 1693 1694 // We may need to add the step a number of times, depending on the unroll 1695 // factor. The last of those goes into the PHI. 
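  // Sketch of the structure built below, assuming an integer IV with step 1,
  // VF = 4 and UF = 2 (value names are illustrative):
  //   vector.body:
  //     %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
  //                                   [ %vec.ind.next, %vector.body ]
  //     %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  //     %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
  // Part 0 maps to %vec.ind itself and part 1 to %step.add; the final add is
  // later renamed, moved to the latch, and wired back into the phi.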
1696 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1697 &*LoopVectorBody->getFirstInsertionPt()); 1698 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1699 Instruction *LastInduction = VecInd; 1700 for (unsigned Part = 0; Part < UF; ++Part) { 1701 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1702 1703 if (isa<TruncInst>(EntryVal)) 1704 addMetadata(LastInduction, EntryVal); 1705 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1706 1707 LastInduction = cast<Instruction>(addFastMathFlag( 1708 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1709 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1710 } 1711 1712 // Move the last step to the end of the latch block. This ensures consistent 1713 // placement of all induction updates. 1714 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1715 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1716 auto *ICmp = cast<Instruction>(Br->getCondition()); 1717 LastInduction->moveBefore(ICmp); 1718 LastInduction->setName("vec.ind.next"); 1719 1720 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1721 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1722 } 1723 1724 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1725 return Cost->isScalarAfterVectorization(I, VF) || 1726 Cost->isProfitableToScalarize(I, VF); 1727 } 1728 1729 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1730 if (shouldScalarizeInstruction(IV)) 1731 return true; 1732 auto isScalarInst = [&](User *U) -> bool { 1733 auto *I = cast<Instruction>(U); 1734 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1735 }; 1736 return llvm::any_of(IV->users(), isScalarInst); 1737 } 1738 1739 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1740 const InductionDescriptor &ID, const Instruction *EntryVal, 1741 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1742 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1743 "Expected either an induction phi-node or a truncate of it!"); 1744 1745 // This induction variable is not the phi from the original loop but the 1746 // newly-created IV based on the proof that casted Phi is equal to the 1747 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1748 // re-uses the same InductionDescriptor that original IV uses but we don't 1749 // have to do any recording in this case - that is done when original IV is 1750 // processed. 1751 if (isa<TruncInst>(EntryVal)) 1752 return; 1753 1754 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1755 if (Casts.empty()) 1756 return; 1757 // Only the first Cast instruction in the Casts vector is of interest. 1758 // The rest of the Casts (if exist) have no uses outside the 1759 // induction update chain itself. 
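  // Illustrative (hypothetical) example: if the original loop contained
  //   %iv     = phi i32 [ 0, %ph ], [ %iv.next, %latch ]
  //   %iv.ext = sext i32 %iv to i64
  // and SCEV proved (possibly under a runtime guard) that %iv.ext equals the
  // corresponding wide induction, then ID.getCastInsts() contains %iv.ext and
  // the value recorded below lets users of the cast reuse the widened IV.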
1760 Instruction *CastInst = *Casts.begin(); 1761 if (Lane < UINT_MAX) 1762 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1763 else 1764 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1765 } 1766 1767 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1768 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1769 "Primary induction variable must have an integer type"); 1770 1771 auto II = Legal->getInductionVars()->find(IV); 1772 assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); 1773 1774 auto ID = II->second; 1775 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1776 1777 // The scalar value to broadcast. This will be derived from the canonical 1778 // induction variable. 1779 Value *ScalarIV = nullptr; 1780 1781 // The value from the original loop to which we are mapping the new induction 1782 // variable. 1783 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1784 1785 // True if we have vectorized the induction variable. 1786 auto VectorizedIV = false; 1787 1788 // Determine if we want a scalar version of the induction variable. This is 1789 // true if the induction variable itself is not widened, or if it has at 1790 // least one user in the loop that is not widened. 1791 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); 1792 1793 // Generate code for the induction step. Note that induction steps are 1794 // required to be loop-invariant 1795 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && 1796 "Induction step should be loop invariant"); 1797 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1798 Value *Step = nullptr; 1799 if (PSE.getSE()->isSCEVable(IV->getType())) { 1800 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1801 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), 1802 LoopVectorPreHeader->getTerminator()); 1803 } else { 1804 Step = cast<SCEVUnknown>(ID.getStep())->getValue(); 1805 } 1806 1807 // Try to create a new independent vector induction variable. If we can't 1808 // create the phi node, we will splat the scalar induction variable in each 1809 // loop iteration. 1810 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { 1811 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1812 VectorizedIV = true; 1813 } 1814 1815 // If we haven't yet vectorized the induction variable, or if we will create 1816 // a scalar one, we need to define the scalar induction variable and step 1817 // values. If we were given a truncation type, truncate the canonical 1818 // induction variable and step. Otherwise, derive these values from the 1819 // induction descriptor. 1820 if (!VectorizedIV || NeedsScalarIV) { 1821 ScalarIV = Induction; 1822 if (IV != OldInduction) { 1823 ScalarIV = IV->getType()->isIntegerTy() 1824 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1825 : Builder.CreateCast(Instruction::SIToFP, Induction, 1826 IV->getType()); 1827 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1828 ScalarIV->setName("offset.idx"); 1829 } 1830 if (Trunc) { 1831 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1832 assert(Step->getType()->isIntegerTy() && 1833 "Truncation requires an integer step"); 1834 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1835 Step = Builder.CreateTrunc(Step, TruncType); 1836 } 1837 } 1838 1839 // If we haven't yet vectorized the induction variable, splat the scalar 1840 // induction variable, and build the necessary step vectors. 1841 // TODO: Don't do it unless the vectorized IV is really required. 1842 if (!VectorizedIV) { 1843 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1844 for (unsigned Part = 0; Part < UF; ++Part) { 1845 Value *EntryPart = 1846 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1847 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1848 if (Trunc) 1849 addMetadata(EntryPart, Trunc); 1850 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1851 } 1852 } 1853 1854 // If an induction variable is only used for counting loop iterations or 1855 // calculating addresses, it doesn't need to be widened. Create scalar steps 1856 // that can be used by instructions we will later scalarize. Note that the 1857 // addition of the scalar steps will not increase the number of instructions 1858 // in the loop in the common case prior to InstCombine. We will be trading 1859 // one vector extract for each scalar step. 1860 if (NeedsScalarIV) 1861 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1862 } 1863 1864 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1865 Instruction::BinaryOps BinOp) { 1866 // Create and check the types. 1867 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1868 int VLen = Val->getType()->getVectorNumElements(); 1869 1870 Type *STy = Val->getType()->getScalarType(); 1871 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1872 "Induction Step must be an integer or FP"); 1873 assert(Step->getType() == STy && "Step has wrong type"); 1874 1875 SmallVector<Constant *, 8> Indices; 1876 1877 if (STy->isIntegerTy()) { 1878 // Create a vector of consecutive numbers from zero to VF. 1879 for (int i = 0; i < VLen; ++i) 1880 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1881 1882 // Add the consecutive indices to the vector value. 1883 Constant *Cv = ConstantVector::get(Indices); 1884 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1885 Step = Builder.CreateVectorSplat(VLen, Step); 1886 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1887 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1888 // which can be found from the original scalar operations. 1889 Step = Builder.CreateMul(Cv, Step); 1890 return Builder.CreateAdd(Val, Step, "induction"); 1891 } 1892 1893 // Floating point induction. 1894 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1895 "Binary Opcode should be specified for FP induction"); 1896 // Create a vector of consecutive numbers from zero to VF. 1897 for (int i = 0; i < VLen; ++i) 1898 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1899 1900 // Add the consecutive indices to the vector value. 
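  // For illustration, assuming VLen = 4, StartIdx = 0, an fadd induction and
  // step %s: Cv is <0.0, 1.0, 2.0, 3.0>, so lane i of the result computed
  // below is Val[i] + i * %s, mirroring the integer path above but using the
  // FP opcodes required by the induction descriptor (all marked 'fast').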
1901 Constant *Cv = ConstantVector::get(Indices); 1902 1903 Step = Builder.CreateVectorSplat(VLen, Step); 1904 1905 // Floating point operations had to be 'fast' to enable the induction. 1906 FastMathFlags Flags; 1907 Flags.setFast(); 1908 1909 Value *MulOp = Builder.CreateFMul(Cv, Step); 1910 if (isa<Instruction>(MulOp)) 1911 // Have to check, MulOp may be a constant 1912 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1913 1914 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1915 if (isa<Instruction>(BOp)) 1916 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1917 return BOp; 1918 } 1919 1920 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1921 Instruction *EntryVal, 1922 const InductionDescriptor &ID) { 1923 // We shouldn't have to build scalar steps if we aren't vectorizing. 1924 assert(VF > 1 && "VF should be greater than one"); 1925 1926 // Get the value type and ensure it and the step have the same integer type. 1927 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1928 assert(ScalarIVTy == Step->getType() && 1929 "Val and Step should have the same type"); 1930 1931 // We build scalar steps for both integer and floating-point induction 1932 // variables. Here, we determine the kind of arithmetic we will perform. 1933 Instruction::BinaryOps AddOp; 1934 Instruction::BinaryOps MulOp; 1935 if (ScalarIVTy->isIntegerTy()) { 1936 AddOp = Instruction::Add; 1937 MulOp = Instruction::Mul; 1938 } else { 1939 AddOp = ID.getInductionOpcode(); 1940 MulOp = Instruction::FMul; 1941 } 1942 1943 // Determine the number of scalars we need to generate for each unroll 1944 // iteration. If EntryVal is uniform, we only need to generate the first 1945 // lane. Otherwise, we generate all VF values. 1946 unsigned Lanes = 1947 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1948 : VF; 1949 // Compute the scalar steps and save the results in VectorLoopValueMap. 1950 for (unsigned Part = 0; Part < UF; ++Part) { 1951 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1952 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1953 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1954 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1955 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1956 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1957 } 1958 } 1959 } 1960 1961 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 1962 assert(V != Induction && "The new induction variable should not be used."); 1963 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 1964 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 1965 1966 // If we have a stride that is replaced by one, do it here. Defer this for 1967 // the VPlan-native path until we start running Legal checks in that path. 1968 if (!EnableVPlanNativePath && Legal->hasStride(V)) 1969 V = ConstantInt::get(V->getType(), 1); 1970 1971 // If we have a vector mapped to this value, return it. 1972 if (VectorLoopValueMap.hasVectorValue(V, Part)) 1973 return VectorLoopValueMap.getVectorValue(V, Part); 1974 1975 // If the value has not been vectorized, check if it has been scalarized 1976 // instead. If it has been scalarized, and we actually need the value in 1977 // vector form, we will construct the vector values on demand. 
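  // Sketch of the on-demand packing performed below, assuming VF = 4 and
  // illustrative names: if V was scalarized into lane values %v0..%v3 for
  // this Part, they are folded back into a vector with
  //   %p0 = insertelement <4 x i32> undef, i32 %v0, i32 0
  //   ...
  //   %p3 = insertelement <4 x i32> %p2, i32 %v3, i32 3
  // (or a single broadcast of lane 0 if V is uniform after vectorization),
  // with the insert point placed right after the last scalar definition.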
1978 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 1979 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 1980 1981 // If we've scalarized a value, that value should be an instruction. 1982 auto *I = cast<Instruction>(V); 1983 1984 // If we aren't vectorizing, we can just copy the scalar map values over to 1985 // the vector map. 1986 if (VF == 1) { 1987 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 1988 return ScalarValue; 1989 } 1990 1991 // Get the last scalar instruction we generated for V and Part. If the value 1992 // is known to be uniform after vectorization, this corresponds to lane zero 1993 // of the Part unroll iteration. Otherwise, the last instruction is the one 1994 // we created for the last vector lane of the Part unroll iteration. 1995 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 1996 auto *LastInst = cast<Instruction>( 1997 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 1998 1999 // Set the insert point after the last scalarized instruction. This ensures 2000 // the insertelement sequence will directly follow the scalar definitions. 2001 auto OldIP = Builder.saveIP(); 2002 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2003 Builder.SetInsertPoint(&*NewIP); 2004 2005 // However, if we are vectorizing, we need to construct the vector values. 2006 // If the value is known to be uniform after vectorization, we can just 2007 // broadcast the scalar value corresponding to lane zero for each unroll 2008 // iteration. Otherwise, we construct the vector values using insertelement 2009 // instructions. Since the resulting vectors are stored in 2010 // VectorLoopValueMap, we will only generate the insertelements once. 2011 Value *VectorValue = nullptr; 2012 if (Cost->isUniformAfterVectorization(I, VF)) { 2013 VectorValue = getBroadcastInstrs(ScalarValue); 2014 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2015 } else { 2016 // Initialize packing with insertelements to start from undef. 2017 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2018 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2019 for (unsigned Lane = 0; Lane < VF; ++Lane) 2020 packScalarIntoVectorValue(V, {Part, Lane}); 2021 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2022 } 2023 Builder.restoreIP(OldIP); 2024 return VectorValue; 2025 } 2026 2027 // If this scalar is unknown, assume that it is a constant or that it is 2028 // loop invariant. Broadcast V and save the value for future uses. 2029 Value *B = getBroadcastInstrs(V); 2030 VectorLoopValueMap.setVectorValue(V, Part, B); 2031 return B; 2032 } 2033 2034 Value * 2035 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2036 const VPIteration &Instance) { 2037 // If the value is not an instruction contained in the loop, it should 2038 // already be scalar. 2039 if (OrigLoop->isLoopInvariant(V)) 2040 return V; 2041 2042 assert(Instance.Lane > 0 2043 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2044 : true && "Uniform values only have lane zero"); 2045 2046 // If the value from the original loop has not been vectorized, it is 2047 // represented by UF x VF scalar values in the new loop. Return the requested 2048 // scalar value. 2049 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2050 return VectorLoopValueMap.getScalarValue(V, Instance); 2051 2052 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2053 // for the given unroll part. 
If this entry is not a vector type (i.e., the 2054 // vectorization factor is one), there is no need to generate an 2055 // extractelement instruction. 2056 auto *U = getOrCreateVectorValue(V, Instance.Part); 2057 if (!U->getType()->isVectorTy()) { 2058 assert(VF == 1 && "Value not scalarized has non-vector type"); 2059 return U; 2060 } 2061 2062 // Otherwise, the value from the original loop has been vectorized and is 2063 // represented by UF vector values. Extract and return the requested scalar 2064 // value from the appropriate vector lane. 2065 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2066 } 2067 2068 void InnerLoopVectorizer::packScalarIntoVectorValue( 2069 Value *V, const VPIteration &Instance) { 2070 assert(V != Induction && "The new induction variable should not be used."); 2071 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2072 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2073 2074 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2075 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2076 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2077 Builder.getInt32(Instance.Lane)); 2078 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2079 } 2080 2081 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2082 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2083 SmallVector<Constant *, 8> ShuffleMask; 2084 for (unsigned i = 0; i < VF; ++i) 2085 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2086 2087 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2088 ConstantVector::get(ShuffleMask), 2089 "reverse"); 2090 } 2091 2092 // Return whether we allow using masked interleave-groups (for dealing with 2093 // strided loads/stores that reside in predicated blocks, or for dealing 2094 // with gaps). 2095 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2096 // If an override option has been passed in for interleaved accesses, use it. 2097 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2098 return EnableMaskedInterleavedMemAccesses; 2099 2100 return TTI.enableMaskedInterleavedAccessVectorization(); 2101 } 2102 2103 // Try to vectorize the interleave group that \p Instr belongs to. 2104 // 2105 // E.g. Translate following interleaved load group (factor = 3): 2106 // for (i = 0; i < N; i+=3) { 2107 // R = Pic[i]; // Member of index 0 2108 // G = Pic[i+1]; // Member of index 1 2109 // B = Pic[i+2]; // Member of index 2 2110 // ... // do something to R, G, B 2111 // } 2112 // To: 2113 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2114 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2115 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2116 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2117 // 2118 // Or translate following interleaved store group (factor = 3): 2119 // for (i = 0; i < N; i+=3) { 2120 // ... 
do something to R, G, B 2121 // Pic[i] = R; // Member of index 0 2122 // Pic[i+1] = G; // Member of index 1 2123 // Pic[i+2] = B; // Member of index 2 2124 // } 2125 // To: 2126 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2127 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2128 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2129 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2130 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2131 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2132 VectorParts *BlockInMask) { 2133 const InterleaveGroup<Instruction> *Group = 2134 Cost->getInterleavedAccessGroup(Instr); 2135 assert(Group && "Fail to get an interleaved access group."); 2136 2137 // Skip if current instruction is not the insert position. 2138 if (Instr != Group->getInsertPos()) 2139 return; 2140 2141 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2142 Value *Ptr = getLoadStorePointerOperand(Instr); 2143 2144 // Prepare for the vector type of the interleaved load/store. 2145 Type *ScalarTy = getMemInstValueType(Instr); 2146 unsigned InterleaveFactor = Group->getFactor(); 2147 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2148 Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); 2149 2150 // Prepare for the new pointers. 2151 setDebugLocFromInst(Builder, Ptr); 2152 SmallVector<Value *, 2> NewPtrs; 2153 unsigned Index = Group->getIndex(Instr); 2154 2155 VectorParts Mask; 2156 bool IsMaskForCondRequired = BlockInMask; 2157 if (IsMaskForCondRequired) { 2158 Mask = *BlockInMask; 2159 // TODO: extend the masked interleaved-group support to reversed access. 2160 assert(!Group->isReverse() && "Reversed masked interleave-group " 2161 "not supported."); 2162 } 2163 2164 // If the group is reverse, adjust the index to refer to the last vector lane 2165 // instead of the first. We adjust the index from the first vector lane, 2166 // rather than directly getting the pointer for lane VF - 1, because the 2167 // pointer operand of the interleaved access is supposed to be uniform. For 2168 // uniform instructions, we're only required to generate a value for the 2169 // first vector lane in each unroll iteration. 2170 if (Group->isReverse()) 2171 Index += (VF - 1) * Group->getFactor(); 2172 2173 bool InBounds = false; 2174 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2175 InBounds = gep->isInBounds(); 2176 2177 for (unsigned Part = 0; Part < UF; Part++) { 2178 Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); 2179 2180 // Notice current instruction could be any index. Need to adjust the address 2181 // to the member of index 0. 2182 // 2183 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2184 // b = A[i]; // Member of index 0 2185 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2186 // 2187 // E.g. A[i+1] = a; // Member of index 1 2188 // A[i] = b; // Member of index 0 2189 // A[i+2] = c; // Member of index 2 (Current instruction) 2190 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2191 NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); 2192 if (InBounds) 2193 cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); 2194 2195 // Cast to the vector pointer type. 
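  // Worked example of the address adjustment above, assuming VF = 4, an
  // interleave factor of 3 and a current instruction at member index 2: the
  // scalar pointer for lane 0 points at A[i+2], the GEP by -Index rebases it
  // to A[i], and the bitcast below produces a pointer to the wide
  // <12 x i32> vector covering all three members of four iterations.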
2196 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); 2197 } 2198 2199 setDebugLocFromInst(Builder, Instr); 2200 Value *UndefVec = UndefValue::get(VecTy); 2201 2202 Value *MaskForGaps = nullptr; 2203 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2204 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2205 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2206 } 2207 2208 // Vectorize the interleaved load group. 2209 if (isa<LoadInst>(Instr)) { 2210 // For each unroll part, create a wide load for the group. 2211 SmallVector<Value *, 2> NewLoads; 2212 for (unsigned Part = 0; Part < UF; Part++) { 2213 Instruction *NewLoad; 2214 if (IsMaskForCondRequired || MaskForGaps) { 2215 assert(useMaskedInterleavedAccesses(*TTI) && 2216 "masked interleaved groups are not allowed."); 2217 Value *GroupMask = MaskForGaps; 2218 if (IsMaskForCondRequired) { 2219 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2220 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2221 Value *ShuffledMask = Builder.CreateShuffleVector( 2222 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2223 GroupMask = MaskForGaps 2224 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2225 MaskForGaps) 2226 : ShuffledMask; 2227 } 2228 NewLoad = 2229 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), 2230 GroupMask, UndefVec, "wide.masked.vec"); 2231 } 2232 else 2233 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], 2234 Group->getAlignment(), "wide.vec"); 2235 Group->addMetadata(NewLoad); 2236 NewLoads.push_back(NewLoad); 2237 } 2238 2239 // For each member in the group, shuffle out the appropriate data from the 2240 // wide loads. 2241 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2242 Instruction *Member = Group->getMember(I); 2243 2244 // Skip the gaps in the group. 2245 if (!Member) 2246 continue; 2247 2248 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2249 for (unsigned Part = 0; Part < UF; Part++) { 2250 Value *StridedVec = Builder.CreateShuffleVector( 2251 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2252 2253 // If this member has different type, cast the result type. 2254 if (Member->getType() != ScalarTy) { 2255 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2256 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2257 } 2258 2259 if (Group->isReverse()) 2260 StridedVec = reverseVector(StridedVec); 2261 2262 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2263 } 2264 } 2265 return; 2266 } 2267 2268 // The sub vector type for current instruction. 2269 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2270 2271 // Vectorize the interleaved store group. 2272 for (unsigned Part = 0; Part < UF; Part++) { 2273 // Collect the stored vector from each member. 2274 SmallVector<Value *, 4> StoredVecs; 2275 for (unsigned i = 0; i < InterleaveFactor; i++) { 2276 // Interleaved store group doesn't allow a gap, so each index has a member 2277 Instruction *Member = Group->getMember(i); 2278 assert(Member && "Fail to get a member from an interleaved store group"); 2279 2280 Value *StoredVec = getOrCreateVectorValue( 2281 cast<StoreInst>(Member)->getValueOperand(), Part); 2282 if (Group->isReverse()) 2283 StoredVec = reverseVector(StoredVec); 2284 2285 // If this member has different type, cast it to a unified type. 
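  // For illustration (hypothetical member types, VF = 4): if the group mixes
  // i32 and float members, the float member's <4 x float> value is bitcast to
  // <4 x i32> by createBitOrPointerCast below, so that all members can be
  // concatenated into a single homogeneous wide vector before interleaving.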
2286 2287 if (StoredVec->getType() != SubVT) 2288 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2289 2290 StoredVecs.push_back(StoredVec); 2291 } 2292 2293 // Concatenate all vectors into a wide vector. 2294 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2295 2296 // Interleave the elements in the wide vector. 2297 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2298 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2299 "interleaved.vec"); 2300 2301 Instruction *NewStoreInstr; 2302 if (IsMaskForCondRequired) { 2303 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2304 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2305 Value *ShuffledMask = Builder.CreateShuffleVector( 2306 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2307 NewStoreInstr = Builder.CreateMaskedStore( 2308 IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); 2309 } 2310 else 2311 NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 2312 Group->getAlignment()); 2313 2314 Group->addMetadata(NewStoreInstr); 2315 } 2316 } 2317 2318 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2319 VectorParts *BlockInMask) { 2320 // Attempt to issue a wide load. 2321 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2322 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2323 2324 assert((LI || SI) && "Invalid Load/Store instruction"); 2325 2326 LoopVectorizationCostModel::InstWidening Decision = 2327 Cost->getWideningDecision(Instr, VF); 2328 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2329 "CM decision should be taken at this point"); 2330 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2331 return vectorizeInterleaveGroup(Instr); 2332 2333 Type *ScalarDataTy = getMemInstValueType(Instr); 2334 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2335 Value *Ptr = getLoadStorePointerOperand(Instr); 2336 unsigned Alignment = getLoadStoreAlignment(Instr); 2337 // An alignment of 0 means target abi alignment. We need to use the scalar's 2338 // target abi alignment in such a case. 2339 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2340 if (!Alignment) 2341 Alignment = DL.getABITypeAlignment(ScalarDataTy); 2342 unsigned AddressSpace = getLoadStoreAddressSpace(Instr); 2343 2344 // Determine if the pointer operand of the access is either consecutive or 2345 // reverse consecutive. 2346 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2347 bool ConsecutiveStride = 2348 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2349 bool CreateGatherScatter = 2350 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2351 2352 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2353 // gather/scatter. Otherwise Decision should have been to Scalarize. 2354 assert((ConsecutiveStride || CreateGatherScatter) && 2355 "The instruction should be scalarized"); 2356 2357 // Handle consecutive loads/stores. 2358 if (ConsecutiveStride) 2359 Ptr = getOrCreateScalarValue(Ptr, {0, 0}); 2360 2361 VectorParts Mask; 2362 bool isMaskRequired = BlockInMask; 2363 if (isMaskRequired) 2364 Mask = *BlockInMask; 2365 2366 bool InBounds = false; 2367 if (auto *gep = dyn_cast<GetElementPtrInst>( 2368 getLoadStorePointerOperand(Instr)->stripPointerCasts())) 2369 InBounds = gep->isInBounds(); 2370 2371 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2372 // Calculate the pointer for the specific unroll-part. 
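  // Sketch, assuming VF = 4 (illustrative): for a forward consecutive access,
  // part p simply uses 'gep Ptr, p * 4', so part 0 covers elements [i, i+3]
  // and part 1 covers [i+4, i+7]. For a reverse access the wide operation
  // must start at the last element of the part, so the code below offsets by
  // -p * 4 and then by 1 - 4 = -3; part 0 therefore covers [i-3, i] and the
  // mask (if any) is reversed to match.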
2373 GetElementPtrInst *PartPtr = nullptr; 2374 2375 if (Reverse) { 2376 // If the address is consecutive but reversed, then the 2377 // wide store needs to start at the last vector element. 2378 PartPtr = cast<GetElementPtrInst>( 2379 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2380 PartPtr->setIsInBounds(InBounds); 2381 PartPtr = cast<GetElementPtrInst>( 2382 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2383 PartPtr->setIsInBounds(InBounds); 2384 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2385 Mask[Part] = reverseVector(Mask[Part]); 2386 } else { 2387 PartPtr = cast<GetElementPtrInst>( 2388 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2389 PartPtr->setIsInBounds(InBounds); 2390 } 2391 2392 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2393 }; 2394 2395 // Handle Stores: 2396 if (SI) { 2397 setDebugLocFromInst(Builder, SI); 2398 2399 for (unsigned Part = 0; Part < UF; ++Part) { 2400 Instruction *NewSI = nullptr; 2401 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); 2402 if (CreateGatherScatter) { 2403 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2404 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2405 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2406 MaskPart); 2407 } else { 2408 if (Reverse) { 2409 // If we store to reverse consecutive memory locations, then we need 2410 // to reverse the order of elements in the stored value. 2411 StoredVal = reverseVector(StoredVal); 2412 // We don't want to update the value in the map as it might be used in 2413 // another expression. So don't call resetVectorValue(StoredVal). 2414 } 2415 auto *VecPtr = CreateVecPtr(Part, Ptr); 2416 if (isMaskRequired) 2417 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2418 Mask[Part]); 2419 else 2420 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2421 } 2422 addMetadata(NewSI, SI); 2423 } 2424 return; 2425 } 2426 2427 // Handle loads. 2428 assert(LI && "Must have a load instruction"); 2429 setDebugLocFromInst(Builder, LI); 2430 for (unsigned Part = 0; Part < UF; ++Part) { 2431 Value *NewLI; 2432 if (CreateGatherScatter) { 2433 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2434 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2435 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2436 nullptr, "wide.masked.gather"); 2437 addMetadata(NewLI, LI); 2438 } else { 2439 auto *VecPtr = CreateVecPtr(Part, Ptr); 2440 if (isMaskRequired) 2441 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], 2442 UndefValue::get(DataTy), 2443 "wide.masked.load"); 2444 else 2445 NewLI = 2446 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2447 2448 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2449 addMetadata(NewLI, LI); 2450 if (Reverse) 2451 NewLI = reverseVector(NewLI); 2452 } 2453 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2454 } 2455 } 2456 2457 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2458 const VPIteration &Instance, 2459 bool IfPredicateInstr) { 2460 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2461 2462 setDebugLocFromInst(Builder, Instr); 2463 2464 // Does this instruction return a value ? 
2465 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2466 2467 Instruction *Cloned = Instr->clone(); 2468 if (!IsVoidRetTy) 2469 Cloned->setName(Instr->getName() + ".cloned"); 2470 2471 // Replace the operands of the cloned instructions with their scalar 2472 // equivalents in the new loop. 2473 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2474 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2475 Cloned->setOperand(op, NewOp); 2476 } 2477 addNewMetadata(Cloned, Instr); 2478 2479 // Place the cloned scalar in the new loop. 2480 Builder.Insert(Cloned); 2481 2482 // Add the cloned scalar to the scalar map entry. 2483 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2484 2485 // If we just cloned a new assumption, add it the assumption cache. 2486 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2487 if (II->getIntrinsicID() == Intrinsic::assume) 2488 AC->registerAssumption(II); 2489 2490 // End if-block. 2491 if (IfPredicateInstr) 2492 PredicatedInstructions.push_back(Cloned); 2493 } 2494 2495 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2496 Value *End, Value *Step, 2497 Instruction *DL) { 2498 BasicBlock *Header = L->getHeader(); 2499 BasicBlock *Latch = L->getLoopLatch(); 2500 // As we're just creating this loop, it's possible no latch exists 2501 // yet. If so, use the header as this will be a single block loop. 2502 if (!Latch) 2503 Latch = Header; 2504 2505 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2506 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2507 setDebugLocFromInst(Builder, OldInst); 2508 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2509 2510 Builder.SetInsertPoint(Latch->getTerminator()); 2511 setDebugLocFromInst(Builder, OldInst); 2512 2513 // Create i+1 and fill the PHINode. 2514 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2515 Induction->addIncoming(Start, L->getLoopPreheader()); 2516 Induction->addIncoming(Next, Latch); 2517 // Create the compare. 2518 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2519 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2520 2521 // Now we have two terminators. Remove the old one from the block. 2522 Latch->getTerminator()->eraseFromParent(); 2523 2524 return Induction; 2525 } 2526 2527 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2528 if (TripCount) 2529 return TripCount; 2530 2531 assert(L && "Create Trip Count for null loop."); 2532 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2533 // Find the loop boundaries. 2534 ScalarEvolution *SE = PSE.getSE(); 2535 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2536 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2537 "Invalid loop count"); 2538 2539 Type *IdxTy = Legal->getWidestInductionType(); 2540 assert(IdxTy && "No type for induction"); 2541 2542 // The exit count might have the type of i64 while the phi is i32. This can 2543 // happen if we have an induction variable that is sign extended before the 2544 // compare. The only way that we get a backedge taken count is that the 2545 // induction variable was signed and as such will not overflow. In such a case 2546 // truncation is legal. 
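  // Illustration: for a loop 'for (i32 i = 0; i < n; ++i)' with n > 0, the
  // backedge-taken count is n - 1 and the trip count expanded further below
  // is (n - 1) + 1 == n. If the backedge-taken count was computed as i64
  // while the widest induction type is i32, the truncation applied here is
  // safe for the reason given above; the zero-extension covers the opposite,
  // narrower-than-IdxTy case.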
2547 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2548 IdxTy->getPrimitiveSizeInBits()) 2549 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2550 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2551 2552 // Get the total trip count from the count by adding 1. 2553 const SCEV *ExitCount = SE->getAddExpr( 2554 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2555 2556 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2557 2558 // Expand the trip count and place the new instructions in the preheader. 2559 // Notice that the pre-header does not change, only the loop body. 2560 SCEVExpander Exp(*SE, DL, "induction"); 2561 2562 // Count holds the overall loop count (N). 2563 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2564 L->getLoopPreheader()->getTerminator()); 2565 2566 if (TripCount->getType()->isPointerTy()) 2567 TripCount = 2568 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2569 L->getLoopPreheader()->getTerminator()); 2570 2571 return TripCount; 2572 } 2573 2574 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2575 if (VectorTripCount) 2576 return VectorTripCount; 2577 2578 Value *TC = getOrCreateTripCount(L); 2579 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2580 2581 Type *Ty = TC->getType(); 2582 Constant *Step = ConstantInt::get(Ty, VF * UF); 2583 2584 // If the tail is to be folded by masking, round the number of iterations N 2585 // up to a multiple of Step instead of rounding down. This is done by first 2586 // adding Step-1 and then rounding down. Note that it's ok if this addition 2587 // overflows: the vector induction variable will eventually wrap to zero given 2588 // that it starts at zero and its Step is a power of two; the loop will then 2589 // exit, with the last early-exit vector comparison also producing all-true. 2590 if (Cost->foldTailByMasking()) { 2591 assert(isPowerOf2_32(VF * UF) && 2592 "VF*UF must be a power of 2 when folding tail by masking"); 2593 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2594 } 2595 2596 // Now we need to generate the expression for the part of the loop that the 2597 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2598 // iterations are not required for correctness, or N - Step, otherwise. Step 2599 // is equal to the vectorization factor (number of SIMD elements) times the 2600 // unroll factor (number of SIMD instructions). 2601 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2602 2603 // If there is a non-reversed interleaved group that may speculatively access 2604 // memory out-of-bounds, we need to ensure that there will be at least one 2605 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2606 // the trip count, we set the remainder to be equal to the step. If the step 2607 // does not evenly divide the trip count, no adjustment is necessary since 2608 // there will already be scalar iterations. Note that the minimum iterations 2609 // check ensures that N >= Step. 
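  // Worked example, assuming VF * UF = 8: for N = 10 the code computes
  // R = 10 urem 8 = 2 and n.vec = 10 - 2 = 8, leaving two scalar remainder
  // iterations. With tail folding, N was first rounded up (10 + 7 = 17,
  // 17 - (17 urem 8) = 16) so the masked vector loop covers all iterations.
  // If a scalar epilogue is required and N = 16, the select below bumps R
  // from 0 to 8 so that n.vec = 8 and at least one scalar iteration remains.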
2610 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2611 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2612 R = Builder.CreateSelect(IsZero, Step, R); 2613 } 2614 2615 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2616 2617 return VectorTripCount; 2618 } 2619 2620 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2621 const DataLayout &DL) { 2622 // Verify that V is a vector type with same number of elements as DstVTy. 2623 unsigned VF = DstVTy->getNumElements(); 2624 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2625 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2626 Type *SrcElemTy = SrcVecTy->getElementType(); 2627 Type *DstElemTy = DstVTy->getElementType(); 2628 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2629 "Vector elements must have same size"); 2630 2631 // Do a direct cast if element types are castable. 2632 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2633 return Builder.CreateBitOrPointerCast(V, DstVTy); 2634 } 2635 // V cannot be directly casted to desired vector type. 2636 // May happen when V is a floating point vector but DstVTy is a vector of 2637 // pointers or vice-versa. Handle this using a two-step bitcast using an 2638 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2639 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2640 "Only one type should be a pointer type"); 2641 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2642 "Only one type should be a floating point type"); 2643 Type *IntTy = 2644 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2645 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2646 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2647 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2648 } 2649 2650 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2651 BasicBlock *Bypass) { 2652 Value *Count = getOrCreateTripCount(L); 2653 BasicBlock *BB = L->getLoopPreheader(); 2654 IRBuilder<> Builder(BB->getTerminator()); 2655 2656 // Generate code to check if the loop's trip count is less than VF * UF, or 2657 // equal to it in case a scalar epilogue is required; this implies that the 2658 // vector trip count is zero. This check also covers the case where adding one 2659 // to the backedge-taken count overflowed leading to an incorrect trip count 2660 // of zero. In this case we will also jump to the scalar loop. 2661 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2662 : ICmpInst::ICMP_ULT; 2663 2664 // If tail is to be folded, vector loop takes care of all iterations. 2665 Value *CheckMinIters = Builder.getFalse(); 2666 if (!Cost->foldTailByMasking()) 2667 CheckMinIters = Builder.CreateICmp( 2668 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2669 "min.iters.check"); 2670 2671 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2672 // Update dominator tree immediately if the generated block is a 2673 // LoopBypassBlock because SCEV expansions to generate loop bypass 2674 // checks may query it before the current function is finished. 
2675 DT->addNewBlock(NewBB, BB); 2676 if (L->getParentLoop()) 2677 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2678 ReplaceInstWithInst(BB->getTerminator(), 2679 BranchInst::Create(Bypass, NewBB, CheckMinIters)); 2680 LoopBypassBlocks.push_back(BB); 2681 } 2682 2683 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2684 BasicBlock *BB = L->getLoopPreheader(); 2685 2686 // Generate the code to check that the SCEV assumptions that we made. 2687 // We want the new basic block to start at the first instruction in a 2688 // sequence of instructions that form a check. 2689 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2690 "scev.check"); 2691 Value *SCEVCheck = 2692 Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); 2693 2694 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2695 if (C->isZero()) 2696 return; 2697 2698 assert(!BB->getParent()->hasOptSize() && 2699 "Cannot SCEV check stride or overflow when optimizing for size"); 2700 2701 // Create a new block containing the stride check. 2702 BB->setName("vector.scevcheck"); 2703 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2704 // Update dominator tree immediately if the generated block is a 2705 // LoopBypassBlock because SCEV expansions to generate loop bypass 2706 // checks may query it before the current function is finished. 2707 DT->addNewBlock(NewBB, BB); 2708 if (L->getParentLoop()) 2709 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2710 ReplaceInstWithInst(BB->getTerminator(), 2711 BranchInst::Create(Bypass, NewBB, SCEVCheck)); 2712 LoopBypassBlocks.push_back(BB); 2713 AddedSafetyChecks = true; 2714 } 2715 2716 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2717 // VPlan-native path does not do any analysis for runtime checks currently. 2718 if (EnableVPlanNativePath) 2719 return; 2720 2721 BasicBlock *BB = L->getLoopPreheader(); 2722 2723 // Generate the code that checks in runtime if arrays overlap. We put the 2724 // checks into a separate block to make the more common case of few elements 2725 // faster. 2726 Instruction *FirstCheckInst; 2727 Instruction *MemRuntimeCheck; 2728 std::tie(FirstCheckInst, MemRuntimeCheck) = 2729 Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); 2730 if (!MemRuntimeCheck) 2731 return; 2732 2733 if (BB->getParent()->hasOptSize()) { 2734 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2735 "Cannot emit memory checks when optimizing for size, unless forced " 2736 "to vectorize."); 2737 ORE->emit([&]() { 2738 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2739 L->getStartLoc(), L->getHeader()) 2740 << "Code-size may be reduced by not forcing " 2741 "vectorization, or by source-code modifications " 2742 "eliminating the need for runtime checks " 2743 "(e.g., adding 'restrict')."; 2744 }); 2745 } 2746 2747 // Create a new block containing the memory check. 2748 BB->setName("vector.memcheck"); 2749 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2750 // Update dominator tree immediately if the generated block is a 2751 // LoopBypassBlock because SCEV expansions to generate loop bypass 2752 // checks may query it before the current function is finished. 
2753 DT->addNewBlock(NewBB, BB); 2754 if (L->getParentLoop()) 2755 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2756 ReplaceInstWithInst(BB->getTerminator(), 2757 BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); 2758 LoopBypassBlocks.push_back(BB); 2759 AddedSafetyChecks = true; 2760 2761 // We currently don't use LoopVersioning for the actual loop cloning but we 2762 // still use it to add the noalias metadata. 2763 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2764 PSE.getSE()); 2765 LVer->prepareNoAliasMetadata(); 2766 } 2767 2768 Value *InnerLoopVectorizer::emitTransformedIndex( 2769 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2770 const InductionDescriptor &ID) const { 2771 2772 SCEVExpander Exp(*SE, DL, "induction"); 2773 auto Step = ID.getStep(); 2774 auto StartValue = ID.getStartValue(); 2775 assert(Index->getType() == Step->getType() && 2776 "Index type does not match StepValue type"); 2777 2778 // Note: the IR at this point is broken. We cannot use SE to create any new 2779 // SCEV and then expand it, hoping that SCEV's simplification will give us 2780 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2781 // lead to various SCEV crashes. So all we can do is to use builder and rely 2782 // on InstCombine for future simplifications. Here we handle some trivial 2783 // cases only. 2784 auto CreateAdd = [&B](Value *X, Value *Y) { 2785 assert(X->getType() == Y->getType() && "Types don't match!"); 2786 if (auto *CX = dyn_cast<ConstantInt>(X)) 2787 if (CX->isZero()) 2788 return Y; 2789 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2790 if (CY->isZero()) 2791 return X; 2792 return B.CreateAdd(X, Y); 2793 }; 2794 2795 auto CreateMul = [&B](Value *X, Value *Y) { 2796 assert(X->getType() == Y->getType() && "Types don't match!"); 2797 if (auto *CX = dyn_cast<ConstantInt>(X)) 2798 if (CX->isOne()) 2799 return Y; 2800 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2801 if (CY->isOne()) 2802 return X; 2803 return B.CreateMul(X, Y); 2804 }; 2805 2806 switch (ID.getKind()) { 2807 case InductionDescriptor::IK_IntInduction: { 2808 assert(Index->getType() == StartValue->getType() && 2809 "Index type does not match StartValue type"); 2810 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2811 return B.CreateSub(StartValue, Index); 2812 auto *Offset = CreateMul( 2813 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2814 return CreateAdd(StartValue, Offset); 2815 } 2816 case InductionDescriptor::IK_PtrInduction: { 2817 assert(isa<SCEVConstant>(Step) && 2818 "Expected constant step for pointer induction"); 2819 return B.CreateGEP( 2820 StartValue->getType()->getPointerElementType(), StartValue, 2821 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2822 &*B.GetInsertPoint()))); 2823 } 2824 case InductionDescriptor::IK_FpInduction: { 2825 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2826 auto InductionBinOp = ID.getInductionBinOp(); 2827 assert(InductionBinOp && 2828 (InductionBinOp->getOpcode() == Instruction::FAdd || 2829 InductionBinOp->getOpcode() == Instruction::FSub) && 2830 "Original bin op should be defined for FP induction"); 2831 2832 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2833 2834 // Floating point operations had to be 'fast' to enable the induction. 
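    // For illustration only (hypothetical values): with StartValue = 1.0,
    // StepValue = 0.5 and Index = %i, the sequence emitted below is roughly
    //   %mul       = fmul fast float 5.000000e-01, %i
    //   %induction = fadd fast float 1.000000e+00, %mul
    // (or fsub, if the original induction used FSub).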
2835 FastMathFlags Flags; 2836 Flags.setFast(); 2837 2838 Value *MulExp = B.CreateFMul(StepValue, Index); 2839 if (isa<Instruction>(MulExp)) 2840 // We have to check, the MulExp may be a constant. 2841 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2842 2843 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2844 "induction"); 2845 if (isa<Instruction>(BOp)) 2846 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2847 2848 return BOp; 2849 } 2850 case InductionDescriptor::IK_NoInduction: 2851 return nullptr; 2852 } 2853 llvm_unreachable("invalid enum"); 2854 } 2855 2856 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2857 /* 2858 In this function we generate a new loop. The new loop will contain 2859 the vectorized instructions while the old loop will continue to run the 2860 scalar remainder. 2861 2862 [ ] <-- loop iteration number check. 2863 / | 2864 / v 2865 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2866 | / | 2867 | / v 2868 || [ ] <-- vector pre header. 2869 |/ | 2870 | v 2871 | [ ] \ 2872 | [ ]_| <-- vector loop. 2873 | | 2874 | v 2875 | -[ ] <--- middle-block. 2876 | / | 2877 | / v 2878 -|- >[ ] <--- new preheader. 2879 | | 2880 | v 2881 | [ ] \ 2882 | [ ]_| <-- old scalar loop to handle remainder. 2883 \ | 2884 \ v 2885 >[ ] <-- exit block. 2886 ... 2887 */ 2888 2889 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 2890 BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); 2891 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 2892 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2893 assert(VectorPH && "Invalid loop structure"); 2894 assert(ExitBlock && "Must have an exit block"); 2895 2896 // Some loops have a single integer induction variable, while other loops 2897 // don't. One example is c++ iterators that often have multiple pointer 2898 // induction variables. In the code below we also support a case where we 2899 // don't have a single induction variable. 2900 // 2901 // We try to obtain an induction variable from the original loop as hard 2902 // as possible. However if we don't find one that: 2903 // - is an integer 2904 // - counts from zero, stepping by one 2905 // - is the size of the widest induction variable type 2906 // then we create a new one. 2907 OldInduction = Legal->getPrimaryInduction(); 2908 Type *IdxTy = Legal->getWidestInductionType(); 2909 2910 // Split the single block loop into the two loop structure described above. 2911 BasicBlock *VecBody = 2912 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 2913 BasicBlock *MiddleBlock = 2914 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 2915 BasicBlock *ScalarPH = 2916 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 2917 2918 // Create and register the new vector loop. 2919 Loop *Lp = LI->AllocateLoop(); 2920 Loop *ParentLoop = OrigLoop->getParentLoop(); 2921 2922 // Insert the new loop into the loop nest and register the new basic blocks 2923 // before calling any utilities such as SCEV that require valid LoopInfo. 2924 if (ParentLoop) { 2925 ParentLoop->addChildLoop(Lp); 2926 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); 2927 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); 2928 } else { 2929 LI->addTopLevelLoop(Lp); 2930 } 2931 Lp->addBasicBlockToLoop(VecBody, *LI); 2932 2933 // Find the loop boundaries. 2934 Value *Count = getOrCreateTripCount(Lp); 2935 2936 Value *StartIdx = ConstantInt::get(IdxTy, 0); 2937 2938 // Now, compare the new count to zero. 
If it is zero skip the vector loop and 2939 // jump to the scalar loop. This check also covers the case where the 2940 // backedge-taken count is uint##_max: adding one to it will overflow leading 2941 // to an incorrect trip count of zero. In this (rare) case we will also jump 2942 // to the scalar loop. 2943 emitMinimumIterationCountCheck(Lp, ScalarPH); 2944 2945 // Generate the code to check any assumptions that we've made for SCEV 2946 // expressions. 2947 emitSCEVChecks(Lp, ScalarPH); 2948 2949 // Generate the code that checks in runtime if arrays overlap. We put the 2950 // checks into a separate block to make the more common case of few elements 2951 // faster. 2952 emitMemRuntimeChecks(Lp, ScalarPH); 2953 2954 // Generate the induction variable. 2955 // The loop step is equal to the vectorization factor (num of SIMD elements) 2956 // times the unroll factor (num of SIMD instructions). 2957 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 2958 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 2959 Induction = 2960 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 2961 getDebugLocFromInstOrOperands(OldInduction)); 2962 2963 // We are going to resume the execution of the scalar loop. 2964 // Go over all of the induction variables that we found and fix the 2965 // PHIs that are left in the scalar version of the loop. 2966 // The starting values of PHI nodes depend on the counter of the last 2967 // iteration in the vectorized loop. 2968 // If we come from a bypass edge then we need to start from the original 2969 // start value. 2970 2971 // This variable saves the new starting index for the scalar loop. It is used 2972 // to test if there are any tail iterations left once the vector loop has 2973 // completed. 2974 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 2975 for (auto &InductionEntry : *List) { 2976 PHINode *OrigPhi = InductionEntry.first; 2977 InductionDescriptor II = InductionEntry.second; 2978 2979 // Create phi nodes to merge from the backedge-taken check block. 2980 PHINode *BCResumeVal = PHINode::Create( 2981 OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); 2982 // Copy original phi DL over to the new one. 2983 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 2984 Value *&EndValue = IVEndValues[OrigPhi]; 2985 if (OrigPhi == OldInduction) { 2986 // We know what the end value is. 2987 EndValue = CountRoundDown; 2988 } else { 2989 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 2990 Type *StepType = II.getStep()->getType(); 2991 Instruction::CastOps CastOp = 2992 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 2993 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 2994 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2995 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 2996 EndValue->setName("ind.end"); 2997 } 2998 2999 // The new PHI merges the original incoming value, in case of a bypass, 3000 // or the value at the end of the vectorized loop. 3001 BCResumeVal->addIncoming(EndValue, MiddleBlock); 3002 3003 // Fix the scalar body counter (PHI node). 3004 // The old induction's phi node in the scalar body needs the truncated 3005 // value. 
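    // For illustration only (shorthand, hypothetical): for the primary
    // induction starting at 0, the merge phi built here ends up as
    //   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %bypass ]
    // with one such incoming value per loop bypass block added below.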
3006 for (BasicBlock *BB : LoopBypassBlocks) 3007 BCResumeVal->addIncoming(II.getStartValue(), BB); 3008 OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); 3009 } 3010 3011 // We need the OrigLoop (scalar loop part) latch terminator to help 3012 // produce correct debug info for the middle block BB instructions. 3013 // The legality check stage guarantees that the loop will have a single 3014 // latch. 3015 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3016 "Scalar loop latch terminator isn't a branch"); 3017 BranchInst *ScalarLatchBr = 3018 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3019 3020 // Add a check in the middle block to see if we have completed 3021 // all of the iterations in the first vector loop. 3022 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3023 // If tail is to be folded, we know we don't need to run the remainder. 3024 Value *CmpN = Builder.getTrue(); 3025 if (!Cost->foldTailByMasking()) { 3026 CmpN = 3027 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3028 CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); 3029 3030 // Here we use the same DebugLoc as the scalar loop latch branch instead 3031 // of the corresponding compare because they may have ended up with 3032 // different line numbers and we want to avoid awkward line stepping while 3033 // debugging. Eg. if the compare has got a line number inside the loop. 3034 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3035 } 3036 3037 BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); 3038 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3039 ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); 3040 3041 // Get ready to start creating new instructions into the vectorized body. 3042 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); 3043 3044 // Save the state. 3045 LoopVectorPreHeader = Lp->getLoopPreheader(); 3046 LoopScalarPreHeader = ScalarPH; 3047 LoopMiddleBlock = MiddleBlock; 3048 LoopExitBlock = ExitBlock; 3049 LoopVectorBody = VecBody; 3050 LoopScalarBody = OldBasicBlock; 3051 3052 Optional<MDNode *> VectorizedLoopID = 3053 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3054 LLVMLoopVectorizeFollowupVectorized}); 3055 if (VectorizedLoopID.hasValue()) { 3056 Lp->setLoopID(VectorizedLoopID.getValue()); 3057 3058 // Do not setAlreadyVectorized if loop attributes have been defined 3059 // explicitly. 3060 return LoopVectorPreHeader; 3061 } 3062 3063 // Keep all loop hints from the original loop on the vector loop (we'll 3064 // replace the vectorizer-specific hints below). 3065 if (MDNode *LID = OrigLoop->getLoopID()) 3066 Lp->setLoopID(LID); 3067 3068 LoopVectorizeHints Hints(Lp, true, *ORE); 3069 Hints.setAlreadyVectorized(); 3070 3071 return LoopVectorPreHeader; 3072 } 3073 3074 // Fix up external users of the induction variable. At this point, we are 3075 // in LCSSA form, with all external PHIs that use the IV having one input value, 3076 // coming from the remainder loop. We need those PHIs to also have a correct 3077 // value for the IV when arriving directly from the middle block. 
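// For illustration only (shorthand, hypothetical): given
//   %iv      = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
//   %iv.next = add i64 %iv, 1
// an LCSSA phi in the exit block that uses %iv.next receives EndValue (the
// vector trip count) from the middle block, while one that uses %iv receives
// the penultimate value Start + Step * (CountRoundDown - 1), recomputed below.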
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
3134 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3135 PHI->addIncoming(I.second, MiddleBlock); 3136 } 3137 } 3138 3139 namespace { 3140 3141 struct CSEDenseMapInfo { 3142 static bool canHandle(const Instruction *I) { 3143 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3144 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3145 } 3146 3147 static inline Instruction *getEmptyKey() { 3148 return DenseMapInfo<Instruction *>::getEmptyKey(); 3149 } 3150 3151 static inline Instruction *getTombstoneKey() { 3152 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3153 } 3154 3155 static unsigned getHashValue(const Instruction *I) { 3156 assert(canHandle(I) && "Unknown instruction!"); 3157 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3158 I->value_op_end())); 3159 } 3160 3161 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3162 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3163 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3164 return LHS == RHS; 3165 return LHS->isIdenticalTo(RHS); 3166 } 3167 }; 3168 3169 } // end anonymous namespace 3170 3171 ///Perform cse of induction variable instructions. 3172 static void cse(BasicBlock *BB) { 3173 // Perform simple cse. 3174 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3175 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3176 Instruction *In = &*I++; 3177 3178 if (!CSEDenseMapInfo::canHandle(In)) 3179 continue; 3180 3181 // Check if we can replace this instruction with any of the 3182 // visited instructions. 3183 if (Instruction *V = CSEMap.lookup(In)) { 3184 In->replaceAllUsesWith(V); 3185 In->eraseFromParent(); 3186 continue; 3187 } 3188 3189 CSEMap[In] = In; 3190 } 3191 } 3192 3193 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3194 unsigned VF, 3195 bool &NeedToScalarize) { 3196 Function *F = CI->getCalledFunction(); 3197 StringRef FnName = CI->getCalledFunction()->getName(); 3198 Type *ScalarRetTy = CI->getType(); 3199 SmallVector<Type *, 4> Tys, ScalarTys; 3200 for (auto &ArgOp : CI->arg_operands()) 3201 ScalarTys.push_back(ArgOp->getType()); 3202 3203 // Estimate cost of scalarized vector call. The source operands are assumed 3204 // to be vectors, so we need to extract individual elements from there, 3205 // execute VF scalar calls, and then gather the result into the vector return 3206 // value. 3207 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3208 if (VF == 1) 3209 return ScalarCallCost; 3210 3211 // Compute corresponding vector type for return value and arguments. 3212 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3213 for (Type *ScalarTy : ScalarTys) 3214 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3215 3216 // Compute costs of unpacking argument values for the scalar calls and 3217 // packing the return values to a vector. 3218 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3219 3220 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3221 3222 // If we can't emit a vector call for this function, then the currently found 3223 // cost is the cost we need to return. 3224 NeedToScalarize = true; 3225 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) 3226 return Cost; 3227 3228 // If the corresponding vector cost is cheaper, return its cost. 
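  // For illustration only (hypothetical numbers): with VF = 4,
  // ScalarCallCost = 10 and ScalarizationCost = 8, the scalarized cost is
  // 10 * 4 + 8 = 48; a vectorized library call costing, say, 12 is then
  // preferred below and NeedToScalarize is cleared.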
3229 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3230 if (VectorCallCost < Cost) { 3231 NeedToScalarize = false; 3232 return VectorCallCost; 3233 } 3234 return Cost; 3235 } 3236 3237 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3238 unsigned VF) { 3239 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3240 assert(ID && "Expected intrinsic call!"); 3241 3242 FastMathFlags FMF; 3243 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3244 FMF = FPMO->getFastMathFlags(); 3245 3246 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3247 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); 3248 } 3249 3250 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3251 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3252 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3253 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3254 } 3255 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3256 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3257 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3258 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3259 } 3260 3261 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3262 // For every instruction `I` in MinBWs, truncate the operands, create a 3263 // truncated version of `I` and reextend its result. InstCombine runs 3264 // later and will remove any ext/trunc pairs. 3265 SmallPtrSet<Value *, 4> Erased; 3266 for (const auto &KV : Cost->getMinimalBitwidths()) { 3267 // If the value wasn't vectorized, we must maintain the original scalar 3268 // type. The absence of the value from VectorLoopValueMap indicates that it 3269 // wasn't vectorized. 3270 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3271 continue; 3272 for (unsigned Part = 0; Part < UF; ++Part) { 3273 Value *I = getOrCreateVectorValue(KV.first, Part); 3274 if (Erased.find(I) != Erased.end() || I->use_empty() || 3275 !isa<Instruction>(I)) 3276 continue; 3277 Type *OriginalTy = I->getType(); 3278 Type *ScalarTruncatedTy = 3279 IntegerType::get(OriginalTy->getContext(), KV.second); 3280 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3281 OriginalTy->getVectorNumElements()); 3282 if (TruncatedTy == OriginalTy) 3283 continue; 3284 3285 IRBuilder<> B(cast<Instruction>(I)); 3286 auto ShrinkOperand = [&](Value *V) -> Value * { 3287 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3288 if (ZI->getSrcTy() == TruncatedTy) 3289 return ZI->getOperand(0); 3290 return B.CreateZExtOrTrunc(V, TruncatedTy); 3291 }; 3292 3293 // The actual instruction modification depends on the instruction type, 3294 // unfortunately. 3295 Value *NewI = nullptr; 3296 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3297 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3298 ShrinkOperand(BO->getOperand(1))); 3299 3300 // Any wrapping introduced by shrinking this operation shouldn't be 3301 // considered undefined behavior. So, we can't unconditionally copy 3302 // arithmetic wrapping flags to NewI. 
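        // For illustration only: an 'add nsw i32' whose result is known to
        // need only 8 bits is rebuilt here as an i8 add; the narrowed add may
        // now wrap legitimately, so nsw/nuw are dropped while the remaining
        // IR flags are kept.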
3303 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3304 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3305 NewI = 3306 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3307 ShrinkOperand(CI->getOperand(1))); 3308 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3309 NewI = B.CreateSelect(SI->getCondition(), 3310 ShrinkOperand(SI->getTrueValue()), 3311 ShrinkOperand(SI->getFalseValue())); 3312 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3313 switch (CI->getOpcode()) { 3314 default: 3315 llvm_unreachable("Unhandled cast!"); 3316 case Instruction::Trunc: 3317 NewI = ShrinkOperand(CI->getOperand(0)); 3318 break; 3319 case Instruction::SExt: 3320 NewI = B.CreateSExtOrTrunc( 3321 CI->getOperand(0), 3322 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3323 break; 3324 case Instruction::ZExt: 3325 NewI = B.CreateZExtOrTrunc( 3326 CI->getOperand(0), 3327 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3328 break; 3329 } 3330 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3331 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3332 auto *O0 = B.CreateZExtOrTrunc( 3333 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3334 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3335 auto *O1 = B.CreateZExtOrTrunc( 3336 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3337 3338 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3339 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3340 // Don't do anything with the operands, just extend the result. 3341 continue; 3342 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3343 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3344 auto *O0 = B.CreateZExtOrTrunc( 3345 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3346 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3347 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3348 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3349 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3350 auto *O0 = B.CreateZExtOrTrunc( 3351 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3352 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3353 } else { 3354 // If we don't know what to do, be conservative and don't do anything. 3355 continue; 3356 } 3357 3358 // Lastly, extend the result. 3359 NewI->takeName(cast<Instruction>(I)); 3360 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3361 I->replaceAllUsesWith(Res); 3362 cast<Instruction>(I)->eraseFromParent(); 3363 Erased.insert(I); 3364 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3365 } 3366 } 3367 3368 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3369 for (const auto &KV : Cost->getMinimalBitwidths()) { 3370 // If the value wasn't vectorized, we must maintain the original scalar 3371 // type. The absence of the value from VectorLoopValueMap indicates that it 3372 // wasn't vectorized. 
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF > 1)
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Update the dominator tree.
  //
  // FIXME: After creating the structure of the new loop, the dominator tree is
  //        no longer up-to-date, and it remains that way until we update it
  //        here. An out-of-date dominator tree is problematic for SCEV,
  //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpanders in several places. Instead, we should
  //        keep the dominator tree up-to-date as we go.
  updateAnalysis();

  // Fix-up external users of the induction variables.
  for (auto &Entry : *Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);
}

void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value.
  auto *VectorInit = ScalarInit;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    VectorInit = Builder.CreateInsertElement(
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or a loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

  // Set the insertion point after the previous value if it is an instruction.
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop. Also, if the previous
  // value is a phi node, we should insert after all the phi nodes to avoid
  // breaking basic block verification.
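  // For illustration only, with VF = 4 and UF = 1: the shuffle mask built
  // below is <3, 4, 5, 6>, so the shuffle of the recurrence phi with the
  // current vector value produces < phi[3], cur[0], cur[1], cur[2] >,
  // i.e. v3 in the example above.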
3532 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || 3533 isa<PHINode>(PreviousLastPart)) 3534 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3535 else 3536 Builder.SetInsertPoint( 3537 &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); 3538 3539 // We will construct a vector for the recurrence by combining the values for 3540 // the current and previous iterations. This is the required shuffle mask. 3541 SmallVector<Constant *, 8> ShuffleMask(VF); 3542 ShuffleMask[0] = Builder.getInt32(VF - 1); 3543 for (unsigned I = 1; I < VF; ++I) 3544 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3545 3546 // The vector from which to take the initial value for the current iteration 3547 // (actual or unrolled). Initially, this is the vector phi node. 3548 Value *Incoming = VecPhi; 3549 3550 // Shuffle the current and previous vector and update the vector parts. 3551 for (unsigned Part = 0; Part < UF; ++Part) { 3552 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3553 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3554 auto *Shuffle = 3555 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3556 ConstantVector::get(ShuffleMask)) 3557 : Incoming; 3558 PhiPart->replaceAllUsesWith(Shuffle); 3559 cast<Instruction>(PhiPart)->eraseFromParent(); 3560 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3561 Incoming = PreviousPart; 3562 } 3563 3564 // Fix the latch value of the new recurrence in the vector loop. 3565 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3566 3567 // Extract the last vector element in the middle block. This will be the 3568 // initial value for the recurrence when jumping to the scalar loop. 3569 auto *ExtractForScalar = Incoming; 3570 if (VF > 1) { 3571 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3572 ExtractForScalar = Builder.CreateExtractElement( 3573 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3574 } 3575 // Extract the second last element in the middle block if the 3576 // Phi is used outside the loop. We need to extract the phi itself 3577 // and not the last element (the phi update in the current iteration). This 3578 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3579 // when the scalar loop is not run at all. 3580 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3581 if (VF > 1) 3582 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3583 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3584 // When loop is unrolled without vectorizing, initialize 3585 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3586 // `Incoming`. This is analogous to the vectorized case above: extracting the 3587 // second last element when VF > 1. 3588 else if (UF > 1) 3589 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3590 3591 // Fix the initial value of the original recurrence in the scalar loop. 3592 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3593 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3594 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3595 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3596 Start->addIncoming(Incoming, BB); 3597 } 3598 3599 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3600 Phi->setName("scalar.recur"); 3601 3602 // Finally, fix users of the recurrence outside the loop. 
  // The users will need either the last value of the scalar recurrence or the
  // last value of the vector recurrence we extracted in the middle block.
  // Since the loop is in LCSSA form, we just need to find all the phi nodes
  // for the original scalar recurrence in the exit block, and then add an edge
  // for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}

void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
      RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, or, and xor;
  // one for multiplication; -1 for and.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ?
VectorStart : Identity; 3682 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3683 cast<PHINode>(VecRdxPhi) 3684 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3685 } 3686 3687 // Before each round, move the insertion point right between 3688 // the PHIs and the values we are going to write. 3689 // This allows us to write both PHINodes and the extractelement 3690 // instructions. 3691 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3692 3693 setDebugLocFromInst(Builder, LoopExitInst); 3694 3695 // If tail is folded by masking, the vector value to leave the loop should be 3696 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3697 // instead of the former. 3698 if (Cost->foldTailByMasking()) { 3699 for (unsigned Part = 0; Part < UF; ++Part) { 3700 Value *VecLoopExitInst = 3701 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3702 Value *Sel = nullptr; 3703 for (User *U : VecLoopExitInst->users()) { 3704 if (isa<SelectInst>(U)) { 3705 assert(!Sel && "Reduction exit feeding two selects"); 3706 Sel = U; 3707 } else 3708 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3709 } 3710 assert(Sel && "Reduction exit feeds no select"); 3711 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3712 } 3713 } 3714 3715 // If the vector reduction can be performed in a smaller type, we truncate 3716 // then extend the loop exit value to enable InstCombine to evaluate the 3717 // entire expression in the smaller type. 3718 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3719 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3720 Builder.SetInsertPoint( 3721 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3722 VectorParts RdxParts(UF); 3723 for (unsigned Part = 0; Part < UF; ++Part) { 3724 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3725 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3726 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3727 : Builder.CreateZExt(Trunc, VecTy); 3728 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3729 UI != RdxParts[Part]->user_end();) 3730 if (*UI != Trunc) { 3731 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3732 RdxParts[Part] = Extnd; 3733 } else { 3734 ++UI; 3735 } 3736 } 3737 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3738 for (unsigned Part = 0; Part < UF; ++Part) { 3739 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3740 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3741 } 3742 } 3743 3744 // Reduce all of the unrolled parts into a single vector. 3745 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3746 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3747 3748 // The middle block terminator has already been assigned a DebugLoc here (the 3749 // OrigLoop's single latch terminator). We want the whole middle block to 3750 // appear to execute on this line because: (a) it is all compiler generated, 3751 // (b) these instructions are always executed after evaluating the latch 3752 // conditional branch, and (c) other passes may add new predecessors which 3753 // terminate on this line. This is the easiest way to ensure we don't 3754 // accidentally cause an extra step back into the loop while debugging. 
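  // For illustration only (hypothetical, UF = 2, add reduction, VF = 4): the
  // loop below combines the unrolled partial sums as
  //   %bin.rdx = add <4 x i32> %rdx.part.1, %rdx.part.0
  // before createTargetReduction emits the final horizontal reduction.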
3755 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3756 for (unsigned Part = 1; Part < UF; ++Part) { 3757 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3758 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3759 // Floating point operations had to be 'fast' to enable the reduction. 3760 ReducedPartRdx = addFastMathFlag( 3761 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3762 ReducedPartRdx, "bin.rdx"), 3763 RdxDesc.getFastMathFlags()); 3764 else 3765 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3766 RdxPart); 3767 } 3768 3769 if (VF > 1) { 3770 bool NoNaN = Legal->hasFunNoNaNAttr(); 3771 ReducedPartRdx = 3772 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3773 // If the reduction can be performed in a smaller type, we need to extend 3774 // the reduction to the wider type before we branch to the original loop. 3775 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3776 ReducedPartRdx = 3777 RdxDesc.isSigned() 3778 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3779 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3780 } 3781 3782 // Create a phi node that merges control-flow from the backedge-taken check 3783 // block and the middle block. 3784 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3785 LoopScalarPreHeader->getTerminator()); 3786 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3787 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3788 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3789 3790 // Now, we need to fix the users of the reduction variable 3791 // inside and outside of the scalar remainder loop. 3792 // We know that the loop is in LCSSA form. We need to update the 3793 // PHI nodes in the exit blocks. 3794 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3795 // All PHINodes need to have a single entry edge, or two if 3796 // we already fixed them. 3797 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3798 3799 // We found a reduction value exit-PHI. Update it with the 3800 // incoming bypass edge. 3801 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3802 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3803 } // end of the LCSSA phi scan. 3804 3805 // Fix the scalar loop reduction variable with the incoming reduction sum 3806 // from the vector body and from the backedge value. 3807 int IncomingEdgeBlockIdx = 3808 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3809 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3810 // Pick the other block. 3811 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3812 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3813 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3814 } 3815 3816 void InnerLoopVectorizer::fixLCSSAPHIs() { 3817 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3818 if (LCSSAPhi.getNumIncomingValues() == 1) { 3819 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3820 // Non-instruction incoming values will have only one value. 3821 unsigned LastLane = 0; 3822 if (isa<Instruction>(IncomingValue)) 3823 LastLane = Cost->isUniformAfterVectorization( 3824 cast<Instruction>(IncomingValue), VF) 3825 ? 0 3826 : VF - 1; 3827 // Can be a loop invariant incoming value or the last scalar value to be 3828 // extracted from the vectorized loop. 
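      // For illustration only: with VF = 4 and UF = 2, a loop-varying incoming
      // value is taken from the last unroll part at lane 3 (its final scalar
      // copy), whereas a uniform-after-vectorization value is taken from
      // lane 0.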
      Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *lastIncomingValue =
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
    }
  }
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
3896 Changed = true; 3897 } 3898 } while (Changed); 3899 } 3900 3901 void InnerLoopVectorizer::fixNonInductionPHIs() { 3902 for (PHINode *OrigPhi : OrigPHIsToFix) { 3903 PHINode *NewPhi = 3904 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 3905 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 3906 3907 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 3908 predecessors(OrigPhi->getParent())); 3909 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 3910 predecessors(NewPhi->getParent())); 3911 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 3912 "Scalar and Vector BB should have the same number of predecessors"); 3913 3914 // The insertion point in Builder may be invalidated by the time we get 3915 // here. Force the Builder insertion point to something valid so that we do 3916 // not run into issues during insertion point restore in 3917 // getOrCreateVectorValue calls below. 3918 Builder.SetInsertPoint(NewPhi); 3919 3920 // The predecessor order is preserved and we can rely on mapping between 3921 // scalar and vector block predecessors. 3922 for (unsigned i = 0; i < NumIncomingValues; ++i) { 3923 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 3924 3925 // When looking up the new scalar/vector values to fix up, use incoming 3926 // values from original phi. 3927 Value *ScIncV = 3928 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 3929 3930 // Scalar incoming value may need a broadcast 3931 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 3932 NewPhi->addIncoming(NewIncV, NewPredBB); 3933 } 3934 } 3935 } 3936 3937 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 3938 unsigned VF) { 3939 PHINode *P = cast<PHINode>(PN); 3940 if (EnableVPlanNativePath) { 3941 // Currently we enter here in the VPlan-native path for non-induction 3942 // PHIs where all control flow is uniform. We simply widen these PHIs. 3943 // Create a vector phi with no operands - the vector phi operands will be 3944 // set at the end of vector code generation. 3945 Type *VecTy = 3946 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 3947 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 3948 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 3949 OrigPHIsToFix.push_back(P); 3950 3951 return; 3952 } 3953 3954 assert(PN->getParent() == OrigLoop->getHeader() && 3955 "Non-header phis should have been handled elsewhere"); 3956 3957 // In order to support recurrences we need to be able to vectorize Phi nodes. 3958 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3959 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 3960 // this value when we vectorize all of the instructions that use the PHI. 3961 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 3962 for (unsigned Part = 0; Part < UF; ++Part) { 3963 // This is phase one of vectorizing PHIs. 3964 Type *VecTy = 3965 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 3966 Value *EntryPart = PHINode::Create( 3967 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 3968 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 3969 } 3970 return; 3971 } 3972 3973 setDebugLocFromInst(Builder, P); 3974 3975 // This PHINode must be an induction variable. 3976 // Make sure that we know about it. 
3977 assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); 3978 3979 InductionDescriptor II = Legal->getInductionVars()->lookup(P); 3980 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 3981 3982 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 3983 // which can be found from the original scalar operations. 3984 switch (II.getKind()) { 3985 case InductionDescriptor::IK_NoInduction: 3986 llvm_unreachable("Unknown induction"); 3987 case InductionDescriptor::IK_IntInduction: 3988 case InductionDescriptor::IK_FpInduction: 3989 llvm_unreachable("Integer/fp induction is handled elsewhere."); 3990 case InductionDescriptor::IK_PtrInduction: { 3991 // Handle the pointer induction variable case. 3992 assert(P->getType()->isPointerTy() && "Unexpected type."); 3993 // This is the normalized GEP that starts counting at zero. 3994 Value *PtrInd = Induction; 3995 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 3996 // Determine the number of scalars we need to generate for each unroll 3997 // iteration. If the instruction is uniform, we only need to generate the 3998 // first lane. Otherwise, we generate all VF values. 3999 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4000 // These are the scalar results. Notice that we don't generate vector GEPs 4001 // because scalar GEPs result in better code. 4002 for (unsigned Part = 0; Part < UF; ++Part) { 4003 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4004 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4005 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4006 Value *SclrGep = 4007 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4008 SclrGep->setName("next.gep"); 4009 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4010 } 4011 } 4012 return; 4013 } 4014 } 4015 } 4016 4017 /// A helper function for checking whether an integer division-related 4018 /// instruction may divide by zero (in which case it must be predicated if 4019 /// executed conditionally in the scalar code). 4020 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4021 /// Non-zero divisors that are non compile-time constants will not be 4022 /// converted into multiplication, so we will still end up scalarizing 4023 /// the division, but can do so w/o predication. 4024 static bool mayDivideByZero(Instruction &I) { 4025 assert((I.getOpcode() == Instruction::UDiv || 4026 I.getOpcode() == Instruction::SDiv || 4027 I.getOpcode() == Instruction::URem || 4028 I.getOpcode() == Instruction::SRem) && 4029 "Unexpected instruction"); 4030 Value *Divisor = I.getOperand(1); 4031 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4032 return !CInt || CInt->isZero(); 4033 } 4034 4035 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4036 switch (I.getOpcode()) { 4037 case Instruction::Br: 4038 case Instruction::PHI: 4039 llvm_unreachable("This instruction is handled by a different recipe."); 4040 case Instruction::GetElementPtr: { 4041 // Construct a vector GEP by widening the operands of the scalar GEP as 4042 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4043 // results in a vector of pointers when at least one operand of the GEP 4044 // is vector-typed. Thus, to keep the representation compact, we only use 4045 // vector-typed operands for loop-varying values. 
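    // For illustration only (shorthand, hypothetical): with VF = 4, a
    // loop-varying index in 'getelementptr inbounds i32, i32* %base, i64 %iv'
    // is widened to 'getelementptr inbounds i32, i32* %base, <4 x i64> %vec.iv',
    // which yields a <4 x i32*> vector of pointers.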
4046 auto *GEP = cast<GetElementPtrInst>(&I); 4047 4048 if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { 4049 // If we are vectorizing, but the GEP has only loop-invariant operands, 4050 // the GEP we build (by only using vector-typed operands for 4051 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4052 // produce a vector of pointers, we need to either arbitrarily pick an 4053 // operand to broadcast, or broadcast a clone of the original GEP. 4054 // Here, we broadcast a clone of the original. 4055 // 4056 // TODO: If at some point we decide to scalarize instructions having 4057 // loop-invariant operands, this special case will no longer be 4058 // required. We would add the scalarization decision to 4059 // collectLoopScalars() and teach getVectorValue() to broadcast 4060 // the lane-zero scalar value. 4061 auto *Clone = Builder.Insert(GEP->clone()); 4062 for (unsigned Part = 0; Part < UF; ++Part) { 4063 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4064 VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); 4065 addMetadata(EntryPart, GEP); 4066 } 4067 } else { 4068 // If the GEP has at least one loop-varying operand, we are sure to 4069 // produce a vector of pointers. But if we are only unrolling, we want 4070 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4071 // produce with the code below will be scalar (if VF == 1) or vector 4072 // (otherwise). Note that for the unroll-only case, we still maintain 4073 // values in the vector mapping with initVector, as we do for other 4074 // instructions. 4075 for (unsigned Part = 0; Part < UF; ++Part) { 4076 // The pointer operand of the new GEP. If it's loop-invariant, we 4077 // won't broadcast it. 4078 auto *Ptr = 4079 OrigLoop->isLoopInvariant(GEP->getPointerOperand()) 4080 ? GEP->getPointerOperand() 4081 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4082 4083 // Collect all the indices for the new GEP. If any index is 4084 // loop-invariant, we won't broadcast it. 4085 SmallVector<Value *, 4> Indices; 4086 for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { 4087 if (OrigLoop->isLoopInvariant(U.get())) 4088 Indices.push_back(U.get()); 4089 else 4090 Indices.push_back(getOrCreateVectorValue(U.get(), Part)); 4091 } 4092 4093 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4094 // but it should be a vector, otherwise. 4095 auto *NewGEP = 4096 GEP->isInBounds() 4097 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4098 Indices) 4099 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4100 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4101 "NewGEP is not a pointer vector"); 4102 VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); 4103 addMetadata(NewGEP, GEP); 4104 } 4105 } 4106 4107 break; 4108 } 4109 case Instruction::UDiv: 4110 case Instruction::SDiv: 4111 case Instruction::SRem: 4112 case Instruction::URem: 4113 case Instruction::Add: 4114 case Instruction::FAdd: 4115 case Instruction::Sub: 4116 case Instruction::FSub: 4117 case Instruction::FNeg: 4118 case Instruction::Mul: 4119 case Instruction::FMul: 4120 case Instruction::FDiv: 4121 case Instruction::FRem: 4122 case Instruction::Shl: 4123 case Instruction::LShr: 4124 case Instruction::AShr: 4125 case Instruction::And: 4126 case Instruction::Or: 4127 case Instruction::Xor: { 4128 // Just widen unops and binops. 
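// For example (illustrative; the value names are made up): with VF == 4 and
// UF == 2, a scalar
//   %add = add nsw i32 %x, %y
// is widened into one operation per unroll part,
//   %add.0 = add nsw <4 x i32> %x.0, %y.0
//   %add.1 = add nsw <4 x i32> %x.1, %y.1
// with the nsw flag carried over by copyIRFlags() below.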
4129 setDebugLocFromInst(Builder, &I); 4130 4131 for (unsigned Part = 0; Part < UF; ++Part) { 4132 SmallVector<Value *, 2> Ops; 4133 for (Value *Op : I.operands()) 4134 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4135 4136 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4137 4138 if (auto *VecOp = dyn_cast<Instruction>(V)) 4139 VecOp->copyIRFlags(&I); 4140 4141 // Use this vector value for all users of the original instruction. 4142 VectorLoopValueMap.setVectorValue(&I, Part, V); 4143 addMetadata(V, &I); 4144 } 4145 4146 break; 4147 } 4148 case Instruction::Select: { 4149 // Widen selects. 4150 // If the selector is loop invariant we can create a select 4151 // instruction with a scalar condition. Otherwise, use vector-select. 4152 auto *SE = PSE.getSE(); 4153 bool InvariantCond = 4154 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4155 setDebugLocFromInst(Builder, &I); 4156 4157 // The condition can be loop invariant but still defined inside the 4158 // loop. This means that we can't just use the original 'cond' value. 4159 // We have to take the 'vectorized' value and pick the first lane. 4160 // Instcombine will make this a no-op. 4161 4162 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4163 4164 for (unsigned Part = 0; Part < UF; ++Part) { 4165 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4166 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4167 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4168 Value *Sel = 4169 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4170 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4171 addMetadata(Sel, &I); 4172 } 4173 4174 break; 4175 } 4176 4177 case Instruction::ICmp: 4178 case Instruction::FCmp: { 4179 // Widen compares. Generate vector compares. 4180 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4181 auto *Cmp = cast<CmpInst>(&I); 4182 setDebugLocFromInst(Builder, Cmp); 4183 for (unsigned Part = 0; Part < UF; ++Part) { 4184 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4185 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4186 Value *C = nullptr; 4187 if (FCmp) { 4188 // Propagate fast math flags. 4189 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4190 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4191 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4192 } else { 4193 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4194 } 4195 VectorLoopValueMap.setVectorValue(&I, Part, C); 4196 addMetadata(C, &I); 4197 } 4198 4199 break; 4200 } 4201 4202 case Instruction::ZExt: 4203 case Instruction::SExt: 4204 case Instruction::FPToUI: 4205 case Instruction::FPToSI: 4206 case Instruction::FPExt: 4207 case Instruction::PtrToInt: 4208 case Instruction::IntToPtr: 4209 case Instruction::SIToFP: 4210 case Instruction::UIToFP: 4211 case Instruction::Trunc: 4212 case Instruction::FPTrunc: 4213 case Instruction::BitCast: { 4214 auto *CI = cast<CastInst>(&I); 4215 setDebugLocFromInst(Builder, CI); 4216 4217 /// Vectorize casts. 4218 Type *DestTy = 4219 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4220 4221 for (unsigned Part = 0; Part < UF; ++Part) { 4222 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4223 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4224 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4225 addMetadata(Cast, &I); 4226 } 4227 break; 4228 } 4229 4230 case Instruction::Call: { 4231 // Ignore dbg intrinsics. 
4232 if (isa<DbgInfoIntrinsic>(I)) 4233 break; 4234 setDebugLocFromInst(Builder, &I); 4235 4236 Module *M = I.getParent()->getParent()->getParent(); 4237 auto *CI = cast<CallInst>(&I); 4238 4239 StringRef FnName = CI->getCalledFunction()->getName(); 4240 Function *F = CI->getCalledFunction(); 4241 Type *RetTy = ToVectorTy(CI->getType(), VF); 4242 SmallVector<Type *, 4> Tys; 4243 for (Value *ArgOperand : CI->arg_operands()) 4244 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4245 4246 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4247 4248 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4249 // version of the instruction. 4250 // Is it beneficial to perform intrinsic call compared to lib call? 4251 bool NeedToScalarize; 4252 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4253 bool UseVectorIntrinsic = 4254 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4255 assert((UseVectorIntrinsic || !NeedToScalarize) && 4256 "Instruction should be scalarized elsewhere."); 4257 4258 for (unsigned Part = 0; Part < UF; ++Part) { 4259 SmallVector<Value *, 4> Args; 4260 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4261 Value *Arg = CI->getArgOperand(i); 4262 // Some intrinsics have a scalar argument - don't replace it with a 4263 // vector. 4264 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4265 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4266 Args.push_back(Arg); 4267 } 4268 4269 Function *VectorF; 4270 if (UseVectorIntrinsic) { 4271 // Use vector version of the intrinsic. 4272 Type *TysForDecl[] = {CI->getType()}; 4273 if (VF > 1) 4274 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4275 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4276 } else { 4277 // Use vector version of the library call. 4278 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); 4279 assert(!VFnName.empty() && "Vector function name is empty."); 4280 VectorF = M->getFunction(VFnName); 4281 if (!VectorF) { 4282 // Generate a declaration 4283 FunctionType *FTy = FunctionType::get(RetTy, Tys, false); 4284 VectorF = 4285 Function::Create(FTy, Function::ExternalLinkage, VFnName, M); 4286 VectorF->copyAttributesFrom(F); 4287 } 4288 } 4289 assert(VectorF && "Can't create vector function."); 4290 4291 SmallVector<OperandBundleDef, 1> OpBundles; 4292 CI->getOperandBundlesAsDefs(OpBundles); 4293 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4294 4295 if (isa<FPMathOperator>(V)) 4296 V->copyFastMathFlags(CI); 4297 4298 VectorLoopValueMap.setVectorValue(&I, Part, V); 4299 addMetadata(V, &I); 4300 } 4301 4302 break; 4303 } 4304 4305 default: 4306 // This instruction is not vectorized by simple widening. 4307 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4308 llvm_unreachable("Unhandled instruction!"); 4309 } // end of switch. 4310 } 4311 4312 void InnerLoopVectorizer::updateAnalysis() { 4313 // Forget the original basic block. 4314 PSE.getSE()->forgetLoop(OrigLoop); 4315 4316 // DT is not kept up-to-date for outer loop vectorization 4317 if (EnableVPlanNativePath) 4318 return; 4319 4320 // Update the dominator tree information. 
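// (Added summary, phrased informally: after the updates below, the middle
// block is dominated by the vector loop latch, while the scalar preheader
// and the loop exit are dominated by the first bypass block, since either
// may be reached without executing the vector loop; the scalar body is
// dominated by the scalar preheader.)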
4321 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && 4322 "Entry does not dominate exit."); 4323 4324 DT->addNewBlock(LoopMiddleBlock, 4325 LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4326 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); 4327 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 4328 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); 4329 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 4330 } 4331 4332 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4333 // We should not collect Scalars more than once per VF. Right now, this 4334 // function is called from collectUniformsAndScalars(), which already does 4335 // this check. Collecting Scalars for VF=1 does not make any sense. 4336 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4337 "This function should not be visited twice for the same VF"); 4338 4339 SmallSetVector<Instruction *, 8> Worklist; 4340 4341 // These sets are used to seed the analysis with pointers used by memory 4342 // accesses that will remain scalar. 4343 SmallSetVector<Instruction *, 8> ScalarPtrs; 4344 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4345 4346 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4347 // The pointer operands of loads and stores will be scalar as long as the 4348 // memory access is not a gather or scatter operation. The value operand of a 4349 // store will remain scalar if the store is scalarized. 4350 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4351 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4352 assert(WideningDecision != CM_Unknown && 4353 "Widening decision should be ready at this moment"); 4354 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4355 if (Ptr == Store->getValueOperand()) 4356 return WideningDecision == CM_Scalarize; 4357 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4358 "Ptr is neither a value or pointer operand"); 4359 return WideningDecision != CM_GatherScatter; 4360 }; 4361 4362 // A helper that returns true if the given value is a bitcast or 4363 // getelementptr instruction contained in the loop. 4364 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4365 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4366 isa<GetElementPtrInst>(V)) && 4367 !TheLoop->isLoopInvariant(V); 4368 }; 4369 4370 // A helper that evaluates a memory access's use of a pointer. If the use 4371 // will be a scalar use, and the pointer is only used by memory accesses, we 4372 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4373 // PossibleNonScalarPtrs. 4374 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4375 // We only care about bitcast and getelementptr instructions contained in 4376 // the loop. 4377 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4378 return; 4379 4380 // If the pointer has already been identified as scalar (e.g., if it was 4381 // also identified as uniform), there's nothing to do. 4382 auto *I = cast<Instruction>(Ptr); 4383 if (Worklist.count(I)) 4384 return; 4385 4386 // If the use of the pointer will be a scalar use, and all users of the 4387 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4388 // place the pointer in PossibleNonScalarPtrs. 
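// For instance (illustrative): a getelementptr feeding only a consecutive,
// widened load keeps a scalar pointer use and ends up in ScalarPtrs, whereas
// the same getelementptr also feeding a gather, or used by a non-memory
// instruction, is recorded in PossibleNonScalarPtrs.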
4389 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4390 return isa<LoadInst>(U) || isa<StoreInst>(U); 4391 })) 4392 ScalarPtrs.insert(I); 4393 else 4394 PossibleNonScalarPtrs.insert(I); 4395 }; 4396 4397 // We seed the scalars analysis with three classes of instructions: (1) 4398 // instructions marked uniform-after-vectorization, (2) bitcast and 4399 // getelementptr instructions used by memory accesses requiring a scalar use, 4400 // and (3) pointer induction variables and their update instructions (we 4401 // currently only scalarize these). 4402 // 4403 // (1) Add to the worklist all instructions that have been identified as 4404 // uniform-after-vectorization. 4405 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4406 4407 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4408 // memory accesses requiring a scalar use. The pointer operands of loads and 4409 // stores will be scalar as long as the memory accesses is not a gather or 4410 // scatter operation. The value operand of a store will remain scalar if the 4411 // store is scalarized. 4412 for (auto *BB : TheLoop->blocks()) 4413 for (auto &I : *BB) { 4414 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4415 evaluatePtrUse(Load, Load->getPointerOperand()); 4416 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4417 evaluatePtrUse(Store, Store->getPointerOperand()); 4418 evaluatePtrUse(Store, Store->getValueOperand()); 4419 } 4420 } 4421 for (auto *I : ScalarPtrs) 4422 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4423 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4424 Worklist.insert(I); 4425 } 4426 4427 // (3) Add to the worklist all pointer induction variables and their update 4428 // instructions. 4429 // 4430 // TODO: Once we are able to vectorize pointer induction variables we should 4431 // no longer insert them into the worklist here. 4432 auto *Latch = TheLoop->getLoopLatch(); 4433 for (auto &Induction : *Legal->getInductionVars()) { 4434 auto *Ind = Induction.first; 4435 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4436 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4437 continue; 4438 Worklist.insert(Ind); 4439 Worklist.insert(IndUpdate); 4440 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4441 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4442 << "\n"); 4443 } 4444 4445 // Insert the forced scalars. 4446 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4447 // induction variable when the PHI user is scalarized. 4448 auto ForcedScalar = ForcedScalars.find(VF); 4449 if (ForcedScalar != ForcedScalars.end()) 4450 for (auto *I : ForcedScalar->second) 4451 Worklist.insert(I); 4452 4453 // Expand the worklist by looking through any bitcasts and getelementptr 4454 // instructions we've already identified as scalar. This is similar to the 4455 // expansion step in collectLoopUniforms(); however, here we're only 4456 // expanding to include additional bitcasts and getelementptr instructions. 
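// E.g. (illustrative): if a getelementptr already identified as scalar has a
// loop-varying bitcast as its base pointer, and every user of that bitcast
// is either already in the worklist or a memory access using it directly as
// a scalar address, the bitcast is marked scalar as well.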
4457 unsigned Idx = 0; 4458 while (Idx != Worklist.size()) { 4459 Instruction *Dst = Worklist[Idx++]; 4460 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4461 continue; 4462 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4463 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4464 auto *J = cast<Instruction>(U); 4465 return !TheLoop->contains(J) || Worklist.count(J) || 4466 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4467 isScalarUse(J, Src)); 4468 })) { 4469 Worklist.insert(Src); 4470 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4471 } 4472 } 4473 4474 // An induction variable will remain scalar if all users of the induction 4475 // variable and induction variable update remain scalar. 4476 for (auto &Induction : *Legal->getInductionVars()) { 4477 auto *Ind = Induction.first; 4478 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4479 4480 // We already considered pointer induction variables, so there's no reason 4481 // to look at their users again. 4482 // 4483 // TODO: Once we are able to vectorize pointer induction variables we 4484 // should no longer skip over them here. 4485 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4486 continue; 4487 4488 // Determine if all users of the induction variable are scalar after 4489 // vectorization. 4490 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4491 auto *I = cast<Instruction>(U); 4492 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4493 }); 4494 if (!ScalarInd) 4495 continue; 4496 4497 // Determine if all users of the induction variable update instruction are 4498 // scalar after vectorization. 4499 auto ScalarIndUpdate = 4500 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4501 auto *I = cast<Instruction>(U); 4502 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4503 }); 4504 if (!ScalarIndUpdate) 4505 continue; 4506 4507 // The induction variable and its update instruction will remain scalar. 4508 Worklist.insert(Ind); 4509 Worklist.insert(IndUpdate); 4510 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4511 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4512 << "\n"); 4513 } 4514 4515 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4516 } 4517 4518 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4519 if (!blockNeedsPredication(I->getParent())) 4520 return false; 4521 switch(I->getOpcode()) { 4522 default: 4523 break; 4524 case Instruction::Load: 4525 case Instruction::Store: { 4526 if (!Legal->isMaskRequired(I)) 4527 return false; 4528 auto *Ptr = getLoadStorePointerOperand(I); 4529 auto *Ty = getMemInstValueType(I); 4530 // We have already decided how to vectorize this instruction, get that 4531 // result. 4532 if (VF > 1) { 4533 InstWidening WideningDecision = getWideningDecision(I, VF); 4534 assert(WideningDecision != CM_Unknown && 4535 "Widening decision should be ready at this moment"); 4536 return WideningDecision == CM_Scalarize; 4537 } 4538 return isa<LoadInst>(I) ? 
4539 !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) 4540 : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); 4541 } 4542 case Instruction::UDiv: 4543 case Instruction::SDiv: 4544 case Instruction::SRem: 4545 case Instruction::URem: 4546 return mayDivideByZero(*I); 4547 } 4548 return false; 4549 } 4550 4551 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4552 unsigned VF) { 4553 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4554 assert(getWideningDecision(I, VF) == CM_Unknown && 4555 "Decision should not be set yet."); 4556 auto *Group = getInterleavedAccessGroup(I); 4557 assert(Group && "Must have a group."); 4558 4559 // If the instruction's allocated size doesn't equal it's type size, it 4560 // requires padding and will be scalarized. 4561 auto &DL = I->getModule()->getDataLayout(); 4562 auto *ScalarTy = getMemInstValueType(I); 4563 if (hasIrregularType(ScalarTy, DL, VF)) 4564 return false; 4565 4566 // Check if masking is required. 4567 // A Group may need masking for one of two reasons: it resides in a block that 4568 // needs predication, or it was decided to use masking to deal with gaps. 4569 bool PredicatedAccessRequiresMasking = 4570 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4571 bool AccessWithGapsRequiresMasking = 4572 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4573 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4574 return true; 4575 4576 // If masked interleaving is required, we expect that the user/target had 4577 // enabled it, because otherwise it either wouldn't have been created or 4578 // it should have been invalidated by the CostModel. 4579 assert(useMaskedInterleavedAccesses(TTI) && 4580 "Masked interleave-groups for predicated accesses are not enabled."); 4581 4582 auto *Ty = getMemInstValueType(I); 4583 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) 4584 : TTI.isLegalMaskedStore(Ty); 4585 } 4586 4587 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4588 unsigned VF) { 4589 // Get and ensure we have a valid memory instruction. 4590 LoadInst *LI = dyn_cast<LoadInst>(I); 4591 StoreInst *SI = dyn_cast<StoreInst>(I); 4592 assert((LI || SI) && "Invalid memory instruction"); 4593 4594 auto *Ptr = getLoadStorePointerOperand(I); 4595 4596 // In order to be widened, the pointer should be consecutive, first of all. 4597 if (!Legal->isConsecutivePtr(Ptr)) 4598 return false; 4599 4600 // If the instruction is a store located in a predicated block, it will be 4601 // scalarized. 4602 if (isScalarWithPredication(I)) 4603 return false; 4604 4605 // If the instruction's allocated size doesn't equal it's type size, it 4606 // requires padding and will be scalarized. 4607 auto &DL = I->getModule()->getDataLayout(); 4608 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4609 if (hasIrregularType(ScalarTy, DL, VF)) 4610 return false; 4611 4612 return true; 4613 } 4614 4615 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4616 // We should not collect Uniforms more than once per VF. Right now, 4617 // this function is called from collectUniformsAndScalars(), which 4618 // already does this check. Collecting Uniforms for VF=1 does not make any 4619 // sense. 4620 4621 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4622 "This function should not be visited twice for the same VF"); 4623 4624 // Visit the list of Uniforms. 
If we'll not find any uniform value, we'll 4625 // not analyze again. Uniforms.count(VF) will return 1. 4626 Uniforms[VF].clear(); 4627 4628 // We now know that the loop is vectorizable! 4629 // Collect instructions inside the loop that will remain uniform after 4630 // vectorization. 4631 4632 // Global values, params and instructions outside of current loop are out of 4633 // scope. 4634 auto isOutOfScope = [&](Value *V) -> bool { 4635 Instruction *I = dyn_cast<Instruction>(V); 4636 return (!I || !TheLoop->contains(I)); 4637 }; 4638 4639 SetVector<Instruction *> Worklist; 4640 BasicBlock *Latch = TheLoop->getLoopLatch(); 4641 4642 // Start with the conditional branch. If the branch condition is an 4643 // instruction contained in the loop that is only used by the branch, it is 4644 // uniform. 4645 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4646 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { 4647 Worklist.insert(Cmp); 4648 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); 4649 } 4650 4651 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4652 // are pointers that are treated like consecutive pointers during 4653 // vectorization. The pointer operands of interleaved accesses are an 4654 // example. 4655 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4656 4657 // Holds pointer operands of instructions that are possibly non-uniform. 4658 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4659 4660 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4661 InstWidening WideningDecision = getWideningDecision(I, VF); 4662 assert(WideningDecision != CM_Unknown && 4663 "Widening decision should be ready at this moment"); 4664 4665 return (WideningDecision == CM_Widen || 4666 WideningDecision == CM_Widen_Reverse || 4667 WideningDecision == CM_Interleave); 4668 }; 4669 // Iterate over the instructions in the loop, and collect all 4670 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4671 // that a consecutive-like pointer operand will be scalarized, we collect it 4672 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4673 // getelementptr instruction can be used by both vectorized and scalarized 4674 // memory instructions. For example, if a loop loads and stores from the same 4675 // location, but the store is conditional, the store will be scalarized, and 4676 // the getelementptr won't remain uniform. 4677 for (auto *BB : TheLoop->blocks()) 4678 for (auto &I : *BB) { 4679 // If there's no pointer operand, there's nothing to do. 4680 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4681 if (!Ptr) 4682 continue; 4683 4684 // True if all users of Ptr are memory accesses that have Ptr as their 4685 // pointer operand. 4686 auto UsersAreMemAccesses = 4687 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4688 return getLoadStorePointerOperand(U) == Ptr; 4689 }); 4690 4691 // Ensure the memory instruction will not be scalarized or used by 4692 // gather/scatter, making its pointer operand non-uniform. If the pointer 4693 // operand is used by any instruction other than a memory access, we 4694 // conservatively assume the pointer operand may be non-uniform. 
4695 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4696 PossibleNonUniformPtrs.insert(Ptr); 4697 4698 // If the memory instruction will be vectorized and its pointer operand 4699 // is consecutive-like, or interleaving - the pointer operand should 4700 // remain uniform. 4701 else 4702 ConsecutiveLikePtrs.insert(Ptr); 4703 } 4704 4705 // Add to the Worklist all consecutive and consecutive-like pointers that 4706 // aren't also identified as possibly non-uniform. 4707 for (auto *V : ConsecutiveLikePtrs) 4708 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { 4709 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); 4710 Worklist.insert(V); 4711 } 4712 4713 // Expand Worklist in topological order: whenever a new instruction 4714 // is added , its users should be already inside Worklist. It ensures 4715 // a uniform instruction will only be used by uniform instructions. 4716 unsigned idx = 0; 4717 while (idx != Worklist.size()) { 4718 Instruction *I = Worklist[idx++]; 4719 4720 for (auto OV : I->operand_values()) { 4721 // isOutOfScope operands cannot be uniform instructions. 4722 if (isOutOfScope(OV)) 4723 continue; 4724 // First order recurrence Phi's should typically be considered 4725 // non-uniform. 4726 auto *OP = dyn_cast<PHINode>(OV); 4727 if (OP && Legal->isFirstOrderRecurrence(OP)) 4728 continue; 4729 // If all the users of the operand are uniform, then add the 4730 // operand into the uniform worklist. 4731 auto *OI = cast<Instruction>(OV); 4732 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4733 auto *J = cast<Instruction>(U); 4734 return Worklist.count(J) || 4735 (OI == getLoadStorePointerOperand(J) && 4736 isUniformDecision(J, VF)); 4737 })) { 4738 Worklist.insert(OI); 4739 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); 4740 } 4741 } 4742 } 4743 4744 // Returns true if Ptr is the pointer operand of a memory access instruction 4745 // I, and I is known to not require scalarization. 4746 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4747 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4748 }; 4749 4750 // For an instruction to be added into Worklist above, all its users inside 4751 // the loop should also be in Worklist. However, this condition cannot be 4752 // true for phi nodes that form a cyclic dependence. We must process phi 4753 // nodes separately. An induction variable will remain uniform if all users 4754 // of the induction variable and induction variable update remain uniform. 4755 // The code below handles both pointer and non-pointer induction variables. 4756 for (auto &Induction : *Legal->getInductionVars()) { 4757 auto *Ind = Induction.first; 4758 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4759 4760 // Determine if all users of the induction variable are uniform after 4761 // vectorization. 4762 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4763 auto *I = cast<Instruction>(U); 4764 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4765 isVectorizedMemAccessUse(I, Ind); 4766 }); 4767 if (!UniformInd) 4768 continue; 4769 4770 // Determine if all users of the induction variable update instruction are 4771 // uniform after vectorization. 
4772 auto UniformIndUpdate =
4773 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4774 auto *I = cast<Instruction>(U);
4775 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4776 isVectorizedMemAccessUse(I, IndUpdate);
4777 });
4778 if (!UniformIndUpdate)
4779 continue;
4780
4781 // The induction variable and its update instruction will remain uniform.
4782 Worklist.insert(Ind);
4783 Worklist.insert(IndUpdate);
4784 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4785 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4786 << "\n");
4787 }
4788
4789 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4790 }
4791
4792 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4793 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4794
4795 if (Legal->getRuntimePointerChecking()->Need) {
4796 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4797 "runtime pointer checks needed. Enable vectorization of this "
4798 "loop with '#pragma clang loop vectorize(enable)' when "
4799 "compiling with -Os/-Oz",
4800 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4801 return true;
4802 }
4803
4804 if (!PSE.getUnionPredicate().getPredicates().empty()) {
4805 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4806 "runtime SCEV checks needed. Enable vectorization of this "
4807 "loop with '#pragma clang loop vectorize(enable)' when "
4808 "compiling with -Os/-Oz",
4809 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4810 return true;
4811 }
4812
4813 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4814 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4815 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4816 "runtime stride == 1 checks needed. Enable vectorization of "
4817 "this loop with '#pragma clang loop vectorize(enable)' when "
4818 "compiling with -Os/-Oz",
4819 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4820 return true;
4821 }
4822
4823 return false;
4824 }
4825
4826 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4827 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4828 // TODO: It may be useful to do so, since it's still likely to be dynamically
4829 // uniform if the target can skip.
4830 reportVectorizationFailure(
4831 "Not inserting runtime ptr check for divergent target",
4832 "runtime pointer checks needed. 
Not enabled for divergent target", 4833 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4834 return None; 4835 } 4836 4837 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4838 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4839 if (TC == 1) { 4840 reportVectorizationFailure("Single iteration (non) loop", 4841 "loop trip count is one, irrelevant for vectorization", 4842 "SingleIterationLoop", ORE, TheLoop); 4843 return None; 4844 } 4845 4846 switch (ScalarEpilogueStatus) { 4847 case CM_ScalarEpilogueAllowed: 4848 return computeFeasibleMaxVF(TC); 4849 case CM_ScalarEpilogueNotNeededUsePredicate: 4850 LLVM_DEBUG( 4851 dbgs() << "LV: vector predicate hint/switch found.\n" 4852 << "LV: Not allowing scalar epilogue, creating predicated " 4853 << "vector loop.\n"); 4854 break; 4855 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4856 // fallthrough as a special case of OptForSize 4857 case CM_ScalarEpilogueNotAllowedOptSize: 4858 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4859 LLVM_DEBUG( 4860 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4861 else 4862 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4863 << "count.\n"); 4864 4865 // Bail if runtime checks are required, which are not good when optimising 4866 // for size. 4867 if (runtimeChecksRequired()) 4868 return None; 4869 break; 4870 } 4871 4872 // Now try the tail folding 4873 4874 // Invalidate interleave groups that require an epilogue if we can't mask 4875 // the interleave-group. 4876 if (!useMaskedInterleavedAccesses(TTI)) 4877 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4878 4879 unsigned MaxVF = computeFeasibleMaxVF(TC); 4880 if (TC > 0 && TC % MaxVF == 0) { 4881 // Accept MaxVF if we do not have a tail. 4882 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4883 return MaxVF; 4884 } 4885 4886 // If we don't know the precise trip count, or if the trip count that we 4887 // found modulo the vectorization factor is not zero, try to fold the tail 4888 // by masking. 4889 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4890 if (Legal->prepareToFoldTailByMasking()) { 4891 FoldTailByMasking = true; 4892 return MaxVF; 4893 } 4894 4895 if (TC == 0) { 4896 reportVectorizationFailure( 4897 "Unable to calculate the loop count due to complex control flow", 4898 "unable to calculate the loop count due to complex control flow", 4899 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4900 return None; 4901 } 4902 4903 reportVectorizationFailure( 4904 "Cannot optimize for size and vectorize at the same time.", 4905 "cannot optimize for size and vectorize at the same time. " 4906 "Enable vectorization of this loop with '#pragma clang loop " 4907 "vectorize(enable)' when compiling with -Os/-Oz", 4908 "NoTailLoopWithOptForSize", ORE, TheLoop); 4909 return None; 4910 } 4911 4912 unsigned 4913 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 4914 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4915 unsigned SmallestType, WidestType; 4916 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4917 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 4918 4919 // Get the maximum safe dependence distance in bits computed by LAA. 4920 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4921 // the memory accesses that is most restrictive (involved in the smallest 4922 // dependence distance). 
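// Worked example (illustrative): with a 256-bit widest register but a
// 128-bit maximum safe dependence distance, WidestRegister is clamped to 128
// below, so for a widest element type of 32 bits MaxVectorSize becomes
// 128 / 32 = 4.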
4923 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4924
4925 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4926
4927 unsigned MaxVectorSize = WidestRegister / WidestType;
4928
4929 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4930 << " / " << WidestType << " bits.\n");
4931 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4932 << WidestRegister << " bits.\n");
4933
4934 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4935 " into one vector!");
4936 if (MaxVectorSize == 0) {
4937 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4938 MaxVectorSize = 1;
4939 return MaxVectorSize;
4940 } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4941 isPowerOf2_32(ConstTripCount)) {
4942 // We need to clamp the VF to be the ConstTripCount. There is no point in
4943 // choosing a higher viable VF as done in the loop below.
4944 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4945 << ConstTripCount << "\n");
4946 MaxVectorSize = ConstTripCount;
4947 return MaxVectorSize;
4948 }
4949
4950 unsigned MaxVF = MaxVectorSize;
4951 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4952 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4953 // Collect all viable vectorization factors larger than the default MaxVF
4954 // (i.e. MaxVectorSize).
4955 SmallVector<unsigned, 8> VFs;
4956 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4957 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4958 VFs.push_back(VS);
4959
4960 // For each VF calculate its register usage.
4961 auto RUs = calculateRegisterUsage(VFs);
4962
4963 // Select the largest VF which doesn't require more registers than existing
4964 // ones.
4965 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4966 for (int i = RUs.size() - 1; i >= 0; --i) {
4967 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4968 MaxVF = VFs[i];
4969 break;
4970 }
4971 }
4972 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4973 if (MaxVF < MinVF) {
4974 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4975 << ") with target's minimum: " << MinVF << '\n');
4976 MaxVF = MinVF;
4977 }
4978 }
4979 }
4980 return MaxVF;
4981 }
4982
4983 VectorizationFactor
4984 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4985 float Cost = expectedCost(1).first;
4986 const float ScalarCost = Cost;
4987 unsigned Width = 1;
4988 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4989
4990 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4991 if (ForceVectorization && MaxVF > 1) {
4992 // Ignore scalar width, because the user explicitly wants vectorization.
4993 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4994 // evaluation.
4995 Cost = std::numeric_limits<float>::max();
4996 }
4997
4998 for (unsigned i = 2; i <= MaxVF; i *= 2) {
4999 // Notice that the vector loop needs to be executed fewer times, so
5000 // we need to divide the cost of the vector loop by the width of
5001 // the vector elements.
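// Worked example (illustrative numbers): if the scalar loop costs 8 per
// iteration and expectedCost(4).first is 20, then the per-lane cost is
// 20 / 4 = 5 < 8, so VF == 4 is preferred over the scalar loop at this point.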
5002 VectorizationCostTy C = expectedCost(i); 5003 float VectorCost = C.first / (float)i; 5004 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5005 << " costs: " << (int)VectorCost << ".\n"); 5006 if (!C.second && !ForceVectorization) { 5007 LLVM_DEBUG( 5008 dbgs() << "LV: Not considering vector loop of width " << i 5009 << " because it will not generate any vector instructions.\n"); 5010 continue; 5011 } 5012 if (VectorCost < Cost) { 5013 Cost = VectorCost; 5014 Width = i; 5015 } 5016 } 5017 5018 if (!EnableCondStoresVectorization && NumPredStores) { 5019 reportVectorizationFailure("There are conditional stores.", 5020 "store that is conditionally executed prevents vectorization", 5021 "ConditionalStore", ORE, TheLoop); 5022 Width = 1; 5023 Cost = ScalarCost; 5024 } 5025 5026 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5027 << "LV: Vectorization seems to be not beneficial, " 5028 << "but was forced by a user.\n"); 5029 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5030 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5031 return Factor; 5032 } 5033 5034 std::pair<unsigned, unsigned> 5035 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5036 unsigned MinWidth = -1U; 5037 unsigned MaxWidth = 8; 5038 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5039 5040 // For each block. 5041 for (BasicBlock *BB : TheLoop->blocks()) { 5042 // For each instruction in the loop. 5043 for (Instruction &I : BB->instructionsWithoutDebug()) { 5044 Type *T = I.getType(); 5045 5046 // Skip ignored values. 5047 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5048 continue; 5049 5050 // Only examine Loads, Stores and PHINodes. 5051 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5052 continue; 5053 5054 // Examine PHI nodes that are reduction variables. Update the type to 5055 // account for the recurrence type. 5056 if (auto *PN = dyn_cast<PHINode>(&I)) { 5057 if (!Legal->isReductionVariable(PN)) 5058 continue; 5059 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; 5060 T = RdxDesc.getRecurrenceType(); 5061 } 5062 5063 // Examine the stored values. 5064 if (auto *ST = dyn_cast<StoreInst>(&I)) 5065 T = ST->getValueOperand()->getType(); 5066 5067 // Ignore loaded pointer types and stored pointer types that are not 5068 // vectorizable. 5069 // 5070 // FIXME: The check here attempts to predict whether a load or store will 5071 // be vectorized. We only know this for certain after a VF has 5072 // been selected. Here, we assume that if an access can be 5073 // vectorized, it will be. We should also look at extending this 5074 // optimization to non-pointer types. 5075 // 5076 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5077 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5078 continue; 5079 5080 MinWidth = std::min(MinWidth, 5081 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5082 MaxWidth = std::max(MaxWidth, 5083 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5084 } 5085 } 5086 5087 return {MinWidth, MaxWidth}; 5088 } 5089 5090 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5091 unsigned LoopCost) { 5092 // -- The interleave heuristics -- 5093 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5094 // There are many micro-architectural considerations that we can't predict 5095 // at this level. 
For example, frontend pressure (on decode or fetch) due to 5096 // code size, or the number and capabilities of the execution ports. 5097 // 5098 // We use the following heuristics to select the interleave count: 5099 // 1. If the code has reductions, then we interleave to break the cross 5100 // iteration dependency. 5101 // 2. If the loop is really small, then we interleave to reduce the loop 5102 // overhead. 5103 // 3. We don't interleave if we think that we will spill registers to memory 5104 // due to the increased register pressure. 5105 5106 if (!isScalarEpilogueAllowed()) 5107 return 1; 5108 5109 // We used the distance for the interleave count. 5110 if (Legal->getMaxSafeDepDistBytes() != -1U) 5111 return 1; 5112 5113 // Do not interleave loops with a relatively small trip count. 5114 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5115 if (TC > 1 && TC < TinyTripCountInterleaveThreshold) 5116 return 1; 5117 5118 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); 5119 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5120 << " registers\n"); 5121 5122 if (VF == 1) { 5123 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5124 TargetNumRegisters = ForceTargetNumScalarRegs; 5125 } else { 5126 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5127 TargetNumRegisters = ForceTargetNumVectorRegs; 5128 } 5129 5130 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5131 // We divide by these constants so assume that we have at least one 5132 // instruction that uses at least one register. 5133 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); 5134 5135 // We calculate the interleave count using the following formula. 5136 // Subtract the number of loop invariants from the number of available 5137 // registers. These registers are used by all of the interleaved instances. 5138 // Next, divide the remaining registers by the number of registers that is 5139 // required by the loop, in order to estimate how many parallel instances 5140 // fit without causing spills. All of this is rounded down if necessary to be 5141 // a power of two. We want power of two interleave count to simplify any 5142 // addressing operations or alignment considerations. 5143 // We also want power of two interleave counts to ensure that the induction 5144 // variable of the vector loop wraps to zero, when tail is folded by masking; 5145 // this currently happens when OptForSize, in which case IC is set to 1 above. 5146 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / 5147 R.MaxLocalUsers); 5148 5149 // Don't count the induction variable as interleaved. 5150 if (EnableIndVarRegisterHeur) 5151 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / 5152 std::max(1U, (R.MaxLocalUsers - 1))); 5153 5154 // Clamp the interleave ranges to reasonable counts. 5155 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5156 5157 // Check if the user has overridden the max. 5158 if (VF == 1) { 5159 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5160 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5161 } else { 5162 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5163 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5164 } 5165 5166 // If the trip count is constant, limit the interleave count to be less than 5167 // the trip count divided by VF. 
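// E.g. (illustrative): with a constant trip count of 64 and VF == 8, the
// check below limits the interleave count to at most 64 / 8 = 8.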
5168 if (TC > 0) { 5169 assert(TC >= VF && "VF exceeds trip count?"); 5170 if ((TC / VF) < MaxInterleaveCount) 5171 MaxInterleaveCount = (TC / VF); 5172 } 5173 5174 // If we did not calculate the cost for VF (because the user selected the VF) 5175 // then we calculate the cost of VF here. 5176 if (LoopCost == 0) 5177 LoopCost = expectedCost(VF).first; 5178 5179 assert(LoopCost && "Non-zero loop cost expected"); 5180 5181 // Clamp the calculated IC to be between the 1 and the max interleave count 5182 // that the target and trip count allows. 5183 if (IC > MaxInterleaveCount) 5184 IC = MaxInterleaveCount; 5185 else if (IC < 1) 5186 IC = 1; 5187 5188 // Interleave if we vectorized this loop and there is a reduction that could 5189 // benefit from interleaving. 5190 if (VF > 1 && !Legal->getReductionVars()->empty()) { 5191 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5192 return IC; 5193 } 5194 5195 // Note that if we've already vectorized the loop we will have done the 5196 // runtime check and so interleaving won't require further checks. 5197 bool InterleavingRequiresRuntimePointerCheck = 5198 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5199 5200 // We want to interleave small loops in order to reduce the loop overhead and 5201 // potentially expose ILP opportunities. 5202 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5203 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5204 // We assume that the cost overhead is 1 and we use the cost model 5205 // to estimate the cost of the loop and interleave until the cost of the 5206 // loop overhead is about 5% of the cost of the loop. 5207 unsigned SmallIC = 5208 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5209 5210 // Interleave until store/load ports (estimated by max interleave count) are 5211 // saturated. 5212 unsigned NumStores = Legal->getNumStores(); 5213 unsigned NumLoads = Legal->getNumLoads(); 5214 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5215 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5216 5217 // If we have a scalar reduction (vector reductions are already dealt with 5218 // by this point), we can increase the critical path length if the loop 5219 // we're interleaving is inside another loop. Limit, by default to 2, so the 5220 // critical path only gets increased by one reduction operation. 5221 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) { 5222 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5223 SmallIC = std::min(SmallIC, F); 5224 StoresIC = std::min(StoresIC, F); 5225 LoadsIC = std::min(LoadsIC, F); 5226 } 5227 5228 if (EnableLoadStoreRuntimeInterleave && 5229 std::max(StoresIC, LoadsIC) > SmallIC) { 5230 LLVM_DEBUG( 5231 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5232 return std::max(StoresIC, LoadsIC); 5233 } 5234 5235 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5236 return SmallIC; 5237 } 5238 5239 // Interleave if this is a large loop (small loops are already dealt with by 5240 // this point) that could benefit from interleaving. 
5241 bool HasReductions = !Legal->getReductionVars()->empty();
5242 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5243 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5244 return IC;
5245 }
5246
5247 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5248 return 1;
5249 }
5250
5251 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5252 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5253 // This function calculates the register usage by measuring the highest number
5254 // of values that are alive at a single location. Obviously, this is a very
5255 // rough estimation. We scan the loop in topological order and
5256 // assign a number to each instruction. We use RPO to ensure that defs are
5257 // met before their users. We assume that each instruction that has in-loop
5258 // users starts an interval. We record every time that an in-loop value is
5259 // used, so we have a list of the first and last occurrences of each
5260 // instruction. Next, we transpose this data structure into a multi map that
5261 // holds the list of intervals that *end* at a specific location. This multi
5262 // map allows us to perform a linear search. We scan the instructions linearly
5263 // and record each time that a new interval starts, by placing it in a set.
5264 // If we find this value in the multi-map then we remove it from the set.
5265 // The max register usage is the maximum size of the set.
5266 // We also search for instructions that are defined outside the loop, but are
5267 // used inside the loop. We need this number separately from the max-interval
5268 // usage number because when we unroll, loop-invariant values do not take
5269 // more registers.
5270 LoopBlocksDFS DFS(TheLoop);
5271 DFS.perform(LI);
5272
5273 RegisterUsage RU;
5274
5275 // Each 'key' in the map opens a new interval. The values
5276 // of the map are the index of the 'last seen' usage of the
5277 // instruction that is the key.
5278 using IntervalMap = DenseMap<Instruction *, unsigned>;
5279
5280 // Maps instruction to its index.
5281 SmallVector<Instruction *, 64> IdxToInstr;
5282 // Marks the end of each interval.
5283 IntervalMap EndPoint;
5284 // Saves the instructions that are used in the loop.
5285 SmallPtrSet<Instruction *, 8> Ends;
5286 // Saves the list of values that are used in the loop but are
5287 // defined outside the loop, such as arguments and constants.
5288 SmallPtrSet<Value *, 8> LoopInvariants;
5289
5290 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5291 for (Instruction &I : BB->instructionsWithoutDebug()) {
5292 IdxToInstr.push_back(&I);
5293
5294 // Save the end location of each USE.
5295 for (Value *U : I.operands()) {
5296 auto *Instr = dyn_cast<Instruction>(U);
5297
5298 // Ignore non-instruction values such as arguments, constants, etc.
5299 if (!Instr)
5300 continue;
5301
5302 // If this instruction is outside the loop then record it and continue.
5303 if (!TheLoop->contains(Instr)) {
5304 LoopInvariants.insert(Instr);
5305 continue;
5306 }
5307
5308 // Overwrite previous end points.
5309 EndPoint[Instr] = IdxToInstr.size();
5310 Ends.insert(Instr);
5311 }
5312 }
5313 }
5314
5315 // Saves the list of intervals that end with the index in 'key'.
5316 using InstrList = SmallVector<Instruction *, 2>;
5317 DenseMap<unsigned, InstrList> TransposeEnds;
5318
5319 // Transpose the EndPoints to a list of values that end at each index.
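// Illustrative example: if EndPoint[%x] == 7, then TransposeEnds[7] contains
// %x, and %x's open interval is closed when the linear scan below reaches
// instruction index 7.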
5320 for (auto &Interval : EndPoint) 5321 TransposeEnds[Interval.second].push_back(Interval.first); 5322 5323 SmallPtrSet<Instruction *, 8> OpenIntervals; 5324 5325 // Get the size of the widest register. 5326 unsigned MaxSafeDepDist = -1U; 5327 if (Legal->getMaxSafeDepDistBytes() != -1U) 5328 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5329 unsigned WidestRegister = 5330 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5331 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5332 5333 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5334 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); 5335 5336 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5337 5338 // A lambda that gets the register usage for the given type and VF. 5339 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5340 if (Ty->isTokenTy()) 5341 return 0U; 5342 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5343 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5344 }; 5345 5346 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5347 Instruction *I = IdxToInstr[i]; 5348 5349 // Remove all of the instructions that end at this location. 5350 InstrList &List = TransposeEnds[i]; 5351 for (Instruction *ToRemove : List) 5352 OpenIntervals.erase(ToRemove); 5353 5354 // Ignore instructions that are never used within the loop. 5355 if (Ends.find(I) == Ends.end()) 5356 continue; 5357 5358 // Skip ignored values. 5359 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5360 continue; 5361 5362 // For each VF find the maximum usage of registers. 5363 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5364 if (VFs[j] == 1) { 5365 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); 5366 continue; 5367 } 5368 collectUniformsAndScalars(VFs[j]); 5369 // Count the number of live intervals. 5370 unsigned RegUsage = 0; 5371 for (auto Inst : OpenIntervals) { 5372 // Skip ignored values for VF > 1. 5373 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || 5374 isScalarAfterVectorization(Inst, VFs[j])) 5375 continue; 5376 RegUsage += GetRegUsage(Inst->getType(), VFs[j]); 5377 } 5378 MaxUsages[j] = std::max(MaxUsages[j], RegUsage); 5379 } 5380 5381 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5382 << OpenIntervals.size() << '\n'); 5383 5384 // Add the current instruction to the list of open intervals. 5385 OpenIntervals.insert(I); 5386 } 5387 5388 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5389 unsigned Invariant = 0; 5390 if (VFs[i] == 1) 5391 Invariant = LoopInvariants.size(); 5392 else { 5393 for (auto Inst : LoopInvariants) 5394 Invariant += GetRegUsage(Inst->getType(), VFs[i]); 5395 } 5396 5397 LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); 5398 LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); 5399 LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant 5400 << '\n'); 5401 5402 RU.LoopInvariantRegs = Invariant; 5403 RU.MaxLocalUsers = MaxUsages[i]; 5404 RUs[i] = RU; 5405 } 5406 5407 return RUs; 5408 } 5409 5410 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5411 // TODO: Cost model for emulated masked load/store is completely 5412 // broken. This hack guides the cost model to use an artificially 5413 // high enough value to practically disable vectorization with such 5414 // operations, except where previously deployed legality hack allowed 5415 // using very low cost values. 
This is to avoid regressions coming simply
5416 // from moving the "masked load/store" check from legality to the cost model.
5417 // Masked Load/Gather emulation was previously never allowed.
5418 // Only a limited amount of Masked Store/Scatter emulation was allowed.
5419 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5420 return isa<LoadInst>(I) ||
5421 (isa<StoreInst>(I) &&
5422 NumPredStores > NumberOfStoresToPredicate);
5423 }
5424
5425 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5426 // If we aren't vectorizing the loop, or if we've already collected the
5427 // instructions to scalarize, there's nothing to do. Collection may already
5428 // have occurred if we have a user-selected VF and are now computing the
5429 // expected cost for interleaving.
5430 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5431 return;
5432
5433 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5434 // not profitable to scalarize any instructions, the presence of VF in the
5435 // map will indicate that we've analyzed it already.
5436 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5437
5438 // Find all the instructions that are scalar with predication in the loop and
5439 // determine if it would be better to not if-convert the blocks they are in.
5440 // If so, we also record the instructions to scalarize.
5441 for (BasicBlock *BB : TheLoop->blocks()) {
5442 if (!blockNeedsPredication(BB))
5443 continue;
5444 for (Instruction &I : *BB)
5445 if (isScalarWithPredication(&I)) {
5446 ScalarCostsTy ScalarCosts;
5447 // Do not apply discount logic if hacked cost is needed
5448 // for emulated masked memrefs.
5449 if (!useEmulatedMaskMemRefHack(&I) &&
5450 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5451 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5452 // Remember that BB will remain after vectorization.
5453 PredicatedBBsAfterVectorization.insert(BB);
5454 }
5455 }
5456 }
5457
5458 int LoopVectorizationCostModel::computePredInstDiscount(
5459 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5460 unsigned VF) {
5461 assert(!isUniformAfterVectorization(PredInst, VF) &&
5462 "Instruction marked uniform-after-vectorization will be predicated");
5463
5464 // Initialize the discount to zero, meaning that the scalar version and the
5465 // vector version cost the same.
5466 int Discount = 0;
5467
5468 // Holds instructions to analyze. The instructions we visit are mapped in
5469 // ScalarCosts. Those instructions are the ones that would be scalarized if
5470 // we find that the scalar version costs less.
5471 SmallVector<Instruction *, 8> Worklist;
5472
5473 // Returns true if the given instruction can be scalarized.
5474 auto canBeScalarized = [&](Instruction *I) -> bool {
5475 // We only attempt to scalarize instructions forming a single-use chain
5476 // from the original predicated block that would otherwise be vectorized.
5477 // Although not strictly necessary, we give up on instructions we know will
5478 // already be scalar to avoid traversing chains that are unlikely to be
5479 // beneficial.
5480 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5481 isScalarAfterVectorization(I, VF))
5482 return false;
5483
5484 // If the instruction is scalar with predication, it will be analyzed
5485 // separately. We ignore it within the context of PredInst.
5486 if (isScalarWithPredication(I)) 5487 return false; 5488 5489 // If any of the instruction's operands are uniform after vectorization, 5490 // the instruction cannot be scalarized. This prevents, for example, a 5491 // masked load from being scalarized. 5492 // 5493 // We assume we will only emit a value for lane zero of an instruction 5494 // marked uniform after vectorization, rather than VF identical values. 5495 // Thus, if we scalarize an instruction that uses a uniform, we would 5496 // create uses of values corresponding to the lanes we aren't emitting code 5497 // for. This behavior can be changed by allowing getScalarValue to clone 5498 // the lane zero values for uniforms rather than asserting. 5499 for (Use &U : I->operands()) 5500 if (auto *J = dyn_cast<Instruction>(U.get())) 5501 if (isUniformAfterVectorization(J, VF)) 5502 return false; 5503 5504 // Otherwise, we can scalarize the instruction. 5505 return true; 5506 }; 5507 5508 // Compute the expected cost discount from scalarizing the entire expression 5509 // feeding the predicated instruction. We currently only consider expressions 5510 // that are single-use instruction chains. 5511 Worklist.push_back(PredInst); 5512 while (!Worklist.empty()) { 5513 Instruction *I = Worklist.pop_back_val(); 5514 5515 // If we've already analyzed the instruction, there's nothing to do. 5516 if (ScalarCosts.find(I) != ScalarCosts.end()) 5517 continue; 5518 5519 // Compute the cost of the vector instruction. Note that this cost already 5520 // includes the scalarization overhead of the predicated instruction. 5521 unsigned VectorCost = getInstructionCost(I, VF).first; 5522 5523 // Compute the cost of the scalarized instruction. This cost is the cost of 5524 // the instruction as if it wasn't if-converted and instead remained in the 5525 // predicated block. We will scale this cost by block probability after 5526 // computing the scalarization overhead. 5527 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5528 5529 // Compute the scalarization overhead of needed insertelement instructions 5530 // and phi nodes. 5531 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5532 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5533 true, false); 5534 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5535 } 5536 5537 // Compute the scalarization overhead of needed extractelement 5538 // instructions. For each of the instruction's operands, if the operand can 5539 // be scalarized, add it to the worklist; otherwise, account for the 5540 // overhead. 5541 for (Use &U : I->operands()) 5542 if (auto *J = dyn_cast<Instruction>(U.get())) { 5543 assert(VectorType::isValidElementType(J->getType()) && 5544 "Instruction has non-scalar type"); 5545 if (canBeScalarized(J)) 5546 Worklist.push_back(J); 5547 else if (needsExtract(J, VF)) 5548 ScalarCost += TTI.getScalarizationOverhead( 5549 ToVectorTy(J->getType(),VF), false, true); 5550 } 5551 5552 // Scale the total scalar cost by block probability. 5553 ScalarCost /= getReciprocalPredBlockProb(); 5554 5555 // Compute the discount. A non-negative discount means the vector version 5556 // of the instruction costs more, and scalarizing would be beneficial. 5557 Discount += VectorCost - ScalarCost; 5558 ScalarCosts[I] = ScalarCost; 5559 } 5560 5561 return Discount; 5562 } 5563 5564 LoopVectorizationCostModel::VectorizationCostTy 5565 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5566 VectorizationCostTy Cost; 5567 5568 // For each block. 
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
          (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = ForceTargetInstructionCost;

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block. Thus, scale the block's cost by the probability of
    // executing it.
    if (VF == 1 && blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}

/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(
    Value *Ptr,
    LoopVectorizationLegality *Legal,
    PredicatedScalarEvolution &PSE,
    const Loop *TheLoop) {

  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}

static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}

unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                                 unsigned VF) {
  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
  Type *ValTy = getMemInstValueType(I);
  auto SE = PSE.getSE();

  unsigned Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
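  // Illustrative example (made-up target costs): for VF = 4 the sum computed
  // below is 4 address computations + 4 scalar memory operations + the
  // insert/extract overhead of moving values between vector lanes and
  // scalars. If the access is predicated, the total is additionally scaled
  // down by the block-execution probability, unless the emulated-mask hack
  // below forces a prohibitively large cost instead.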
5659 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5660 5661 // Don't pass *I here, since it is scalar but will actually be part of a 5662 // vectorized loop where the user of it is a vectorized instruction. 5663 Cost += VF * 5664 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 5665 AS); 5666 5667 // Get the overhead of the extractelement and insertelement instructions 5668 // we might create due to scalarization. 5669 Cost += getScalarizationOverhead(I, VF); 5670 5671 // If we have a predicated store, it may not be executed for each vector 5672 // lane. Scale the cost by the probability of executing the predicated 5673 // block. 5674 if (isPredicatedInst(I)) { 5675 Cost /= getReciprocalPredBlockProb(); 5676 5677 if (useEmulatedMaskMemRefHack(I)) 5678 // Artificially setting to a high enough value to practically disable 5679 // vectorization with such operations. 5680 Cost = 3000000; 5681 } 5682 5683 return Cost; 5684 } 5685 5686 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5687 unsigned VF) { 5688 Type *ValTy = getMemInstValueType(I); 5689 Type *VectorTy = ToVectorTy(ValTy, VF); 5690 unsigned Alignment = getLoadStoreAlignment(I); 5691 Value *Ptr = getLoadStorePointerOperand(I); 5692 unsigned AS = getLoadStoreAddressSpace(I); 5693 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5694 5695 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5696 "Stride should be 1 or -1 for consecutive memory access"); 5697 unsigned Cost = 0; 5698 if (Legal->isMaskRequired(I)) 5699 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 5700 else 5701 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5702 5703 bool Reverse = ConsecutiveStride < 0; 5704 if (Reverse) 5705 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5706 return Cost; 5707 } 5708 5709 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5710 unsigned VF) { 5711 Type *ValTy = getMemInstValueType(I); 5712 Type *VectorTy = ToVectorTy(ValTy, VF); 5713 unsigned Alignment = getLoadStoreAlignment(I); 5714 unsigned AS = getLoadStoreAddressSpace(I); 5715 if (isa<LoadInst>(I)) { 5716 return TTI.getAddressComputationCost(ValTy) + 5717 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5718 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5719 } 5720 StoreInst *SI = cast<StoreInst>(I); 5721 5722 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5723 return TTI.getAddressComputationCost(ValTy) + 5724 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5725 (isLoopInvariantStoreValue ? 
0 : TTI.getVectorInstrCost( 5726 Instruction::ExtractElement, 5727 VectorTy, VF - 1)); 5728 } 5729 5730 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5731 unsigned VF) { 5732 Type *ValTy = getMemInstValueType(I); 5733 Type *VectorTy = ToVectorTy(ValTy, VF); 5734 unsigned Alignment = getLoadStoreAlignment(I); 5735 Value *Ptr = getLoadStorePointerOperand(I); 5736 5737 return TTI.getAddressComputationCost(VectorTy) + 5738 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5739 Legal->isMaskRequired(I), Alignment); 5740 } 5741 5742 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5743 unsigned VF) { 5744 Type *ValTy = getMemInstValueType(I); 5745 Type *VectorTy = ToVectorTy(ValTy, VF); 5746 unsigned AS = getLoadStoreAddressSpace(I); 5747 5748 auto Group = getInterleavedAccessGroup(I); 5749 assert(Group && "Fail to get an interleaved access group."); 5750 5751 unsigned InterleaveFactor = Group->getFactor(); 5752 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5753 5754 // Holds the indices of existing members in an interleaved load group. 5755 // An interleaved store group doesn't need this as it doesn't allow gaps. 5756 SmallVector<unsigned, 4> Indices; 5757 if (isa<LoadInst>(I)) { 5758 for (unsigned i = 0; i < InterleaveFactor; i++) 5759 if (Group->getMember(i)) 5760 Indices.push_back(i); 5761 } 5762 5763 // Calculate the cost of the whole interleaved group. 5764 bool UseMaskForGaps = 5765 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5766 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5767 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5768 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5769 5770 if (Group->isReverse()) { 5771 // TODO: Add support for reversed masked interleaved access. 5772 assert(!Legal->isMaskRequired(I) && 5773 "Reverse masked interleaved access not supported."); 5774 Cost += Group->getNumMembers() * 5775 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5776 } 5777 return Cost; 5778 } 5779 5780 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5781 unsigned VF) { 5782 // Calculate scalar cost only. Vectorization cost should be ready at this 5783 // moment. 5784 if (VF == 1) { 5785 Type *ValTy = getMemInstValueType(I); 5786 unsigned Alignment = getLoadStoreAlignment(I); 5787 unsigned AS = getLoadStoreAddressSpace(I); 5788 5789 return TTI.getAddressComputationCost(ValTy) + 5790 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5791 } 5792 return getWideningCost(I, VF); 5793 } 5794 5795 LoopVectorizationCostModel::VectorizationCostTy 5796 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5797 // If we know that this instruction will remain uniform, check the cost of 5798 // the scalar version. 5799 if (isUniformAfterVectorization(I, VF)) 5800 VF = 1; 5801 5802 if (VF > 1 && isProfitableToScalarize(I, VF)) 5803 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5804 5805 // Forced scalars do not have any scalarization overhead. 
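  // (Forced scalars are costed below as VF copies of the scalar instruction,
  // with no insert/extract overhead added. This is used, for example, for
  // address computations that are kept scalar on targets that do not prefer
  // vectorized addressing; see setCostBasedWideningDecision.)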
5806 auto ForcedScalar = ForcedScalars.find(VF); 5807 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5808 auto InstSet = ForcedScalar->second; 5809 if (InstSet.find(I) != InstSet.end()) 5810 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5811 } 5812 5813 Type *VectorTy; 5814 unsigned C = getInstructionCost(I, VF, VectorTy); 5815 5816 bool TypeNotScalarized = 5817 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5818 return VectorizationCostTy(C, TypeNotScalarized); 5819 } 5820 5821 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5822 unsigned VF) { 5823 5824 if (VF == 1) 5825 return 0; 5826 5827 unsigned Cost = 0; 5828 Type *RetTy = ToVectorTy(I->getType(), VF); 5829 if (!RetTy->isVoidTy() && 5830 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5831 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5832 5833 // Some targets keep addresses scalar. 5834 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5835 return Cost; 5836 5837 // Some targets support efficient element stores. 5838 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5839 return Cost; 5840 5841 // Collect operands to consider. 5842 CallInst *CI = dyn_cast<CallInst>(I); 5843 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 5844 5845 // Skip operands that do not require extraction/scalarization and do not incur 5846 // any overhead. 5847 return Cost + TTI.getOperandsScalarizationOverhead( 5848 filterExtractingOperands(Ops, VF), VF); 5849 } 5850 5851 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 5852 if (VF == 1) 5853 return; 5854 NumPredStores = 0; 5855 for (BasicBlock *BB : TheLoop->blocks()) { 5856 // For each instruction in the old loop. 5857 for (Instruction &I : *BB) { 5858 Value *Ptr = getLoadStorePointerOperand(&I); 5859 if (!Ptr) 5860 continue; 5861 5862 // TODO: We should generate better code and update the cost model for 5863 // predicated uniform stores. Today they are treated as any other 5864 // predicated store (see added test cases in 5865 // invariant-store-vectorization.ll). 5866 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 5867 NumPredStores++; 5868 5869 if (Legal->isUniform(Ptr) && 5870 // Conditional loads and stores should be scalarized and predicated. 5871 // isScalarWithPredication cannot be used here since masked 5872 // gather/scatters are not considered scalar with predication. 5873 !Legal->blockNeedsPredication(I.getParent())) { 5874 // TODO: Avoid replicating loads and stores instead of 5875 // relying on instcombine to remove them. 5876 // Load: Scalar load + broadcast 5877 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 5878 unsigned Cost = getUniformMemOpCost(&I, VF); 5879 setWideningDecision(&I, VF, CM_Scalarize, Cost); 5880 continue; 5881 } 5882 5883 // We assume that widening is the best solution when possible. 5884 if (memoryInstructionCanBeWidened(&I, VF)) { 5885 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 5886 int ConsecutiveStride = 5887 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 5888 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5889 "Expected consecutive stride."); 5890 InstWidening Decision = 5891 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 5892 setWideningDecision(&I, VF, Decision, Cost); 5893 continue; 5894 } 5895 5896 // Choose between Interleaving, Gather/Scatter or Scalarization. 
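      // The three candidate costs computed below are compared and the
      // cheapest strategy is recorded via setWideningDecision. Illustrative
      // example (made-up numbers): for VF = 4, InterleaveCost = 8,
      // GatherScatterCost = 12 and ScalarizationCost = 20 would select
      // CM_Interleave with cost 8.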
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose the best option for the current VF and record this decision
      // for use during vectorization.
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, it is instead
      // changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(I, VF, CM_Scalarize,
                            (VF * getMemoryInstructionCost(I, 1)));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(Member, VF, CM_Scalarize,
                                (VF * getMemoryInstructionCost(Member, 1)));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}

unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF > 1 && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
             PredicatedBBsAfterVectorization.end() ||
         PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
             PredicatedBBsAfterVectorization.end()))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      Type *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
              (TTI.getCFInstrCost(Instruction::Br) * VF));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br);
    else
      // This branch will be eliminated by if-conversion.
      return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                VectorTy, VF - 1, VectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
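    // For example, an if-converted phi with three incoming values becomes two
    // selects chained on the incoming-edge masks, so its cost below is twice
    // the cost of a vector select (illustrative).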
6061 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6062 return (Phi->getNumIncomingValues() - 1) * 6063 TTI.getCmpSelInstrCost( 6064 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6065 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6066 6067 return TTI.getCFInstrCost(Instruction::PHI); 6068 } 6069 case Instruction::UDiv: 6070 case Instruction::SDiv: 6071 case Instruction::URem: 6072 case Instruction::SRem: 6073 // If we have a predicated instruction, it may not be executed for each 6074 // vector lane. Get the scalarization cost and scale this amount by the 6075 // probability of executing the predicated block. If the instruction is not 6076 // predicated, we fall through to the next case. 6077 if (VF > 1 && isScalarWithPredication(I)) { 6078 unsigned Cost = 0; 6079 6080 // These instructions have a non-void type, so account for the phi nodes 6081 // that we will create. This cost is likely to be zero. The phi node 6082 // cost, if any, should be scaled by the block probability because it 6083 // models a copy at the end of each predicated block. 6084 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6085 6086 // The cost of the non-predicated instruction. 6087 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6088 6089 // The cost of insertelement and extractelement instructions needed for 6090 // scalarization. 6091 Cost += getScalarizationOverhead(I, VF); 6092 6093 // Scale the cost by the probability of executing the predicated blocks. 6094 // This assumes the predicated block for each vector lane is equally 6095 // likely. 6096 return Cost / getReciprocalPredBlockProb(); 6097 } 6098 LLVM_FALLTHROUGH; 6099 case Instruction::Add: 6100 case Instruction::FAdd: 6101 case Instruction::Sub: 6102 case Instruction::FSub: 6103 case Instruction::Mul: 6104 case Instruction::FMul: 6105 case Instruction::FDiv: 6106 case Instruction::FRem: 6107 case Instruction::Shl: 6108 case Instruction::LShr: 6109 case Instruction::AShr: 6110 case Instruction::And: 6111 case Instruction::Or: 6112 case Instruction::Xor: { 6113 // Since we will replace the stride by 1 the multiplication should go away. 6114 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6115 return 0; 6116 // Certain instructions can be cheaper to vectorize if they have a constant 6117 // second vector operand. One example of this are shifts on x86. 6118 Value *Op2 = I->getOperand(1); 6119 TargetTransformInfo::OperandValueProperties Op2VP; 6120 TargetTransformInfo::OperandValueKind Op2VK = 6121 TTI.getOperandInfo(Op2, Op2VP); 6122 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6123 Op2VK = TargetTransformInfo::OK_UniformValue; 6124 6125 SmallVector<const Value *, 4> Operands(I->operand_values()); 6126 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6127 return N * TTI.getArithmeticInstrCost( 6128 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6129 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); 6130 } 6131 case Instruction::FNeg: { 6132 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6133 return N * TTI.getArithmeticInstrCost( 6134 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6135 TargetTransformInfo::OK_AnyValue, 6136 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6137 I->getOperand(0)); 6138 } 6139 case Instruction::Select: { 6140 SelectInst *SI = cast<SelectInst>(I); 6141 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6142 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6143 Type *CondTy = SI->getCondition()->getType(); 6144 if (!ScalarCond) 6145 CondTy = VectorType::get(CondTy, VF); 6146 6147 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6148 } 6149 case Instruction::ICmp: 6150 case Instruction::FCmp: { 6151 Type *ValTy = I->getOperand(0)->getType(); 6152 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6153 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6154 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6155 VectorTy = ToVectorTy(ValTy, VF); 6156 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6157 } 6158 case Instruction::Store: 6159 case Instruction::Load: { 6160 unsigned Width = VF; 6161 if (Width > 1) { 6162 InstWidening Decision = getWideningDecision(I, Width); 6163 assert(Decision != CM_Unknown && 6164 "CM decision should be taken at this point"); 6165 if (Decision == CM_Scalarize) 6166 Width = 1; 6167 } 6168 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6169 return getMemoryInstructionCost(I, VF); 6170 } 6171 case Instruction::ZExt: 6172 case Instruction::SExt: 6173 case Instruction::FPToUI: 6174 case Instruction::FPToSI: 6175 case Instruction::FPExt: 6176 case Instruction::PtrToInt: 6177 case Instruction::IntToPtr: 6178 case Instruction::SIToFP: 6179 case Instruction::UIToFP: 6180 case Instruction::Trunc: 6181 case Instruction::FPTrunc: 6182 case Instruction::BitCast: { 6183 // We optimize the truncation of induction variables having constant 6184 // integer steps. The cost of these truncations is the same as the scalar 6185 // operation. 6186 if (isOptimizableIVTruncate(I, VF)) { 6187 auto *Trunc = cast<TruncInst>(I); 6188 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6189 Trunc->getSrcTy(), Trunc); 6190 } 6191 6192 Type *SrcScalarTy = I->getOperand(0)->getType(); 6193 Type *SrcVecTy = 6194 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6195 if (canTruncateToMinimalBitwidth(I, VF)) { 6196 // This cast is going to be shrunk. This may remove the cast or it might 6197 // turn it into slightly different cast. For example, if MinBW == 16, 6198 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6199 // 6200 // Calculate the modified src and dest types. 6201 Type *MinVecTy = VectorTy; 6202 if (I->getOpcode() == Instruction::Trunc) { 6203 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6204 VectorTy = 6205 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6206 } else if (I->getOpcode() == Instruction::ZExt || 6207 I->getOpcode() == Instruction::SExt) { 6208 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6209 VectorTy = 6210 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6211 } 6212 } 6213 6214 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6215 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6216 } 6217 case Instruction::Call: { 6218 bool NeedToScalarize; 6219 CallInst *CI = cast<CallInst>(I); 6220 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6221 if (getVectorIntrinsicIDForCall(CI, TLI)) 6222 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6223 return CallCost; 6224 } 6225 default: 6226 // The cost of executing VF copies of the scalar instruction. This opcode 6227 // is unknown. Assume that it is the same as 'mul'. 6228 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6229 getScalarizationOverhead(I, VF); 6230 } // end of switch. 6231 } 6232 6233 char LoopVectorize::ID = 0; 6234 6235 static const char lv_name[] = "Loop Vectorization"; 6236 6237 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6238 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6239 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6240 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6241 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6242 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6243 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6244 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6245 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6246 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6247 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6248 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6249 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6250 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6251 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6252 6253 namespace llvm { 6254 6255 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6256 6257 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6258 bool VectorizeOnlyWhenForced) { 6259 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6260 } 6261 6262 } // end namespace llvm 6263 6264 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6265 // Check if the pointer operand of a load or store instruction is 6266 // consecutive. 6267 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6268 return Legal->isConsecutivePtr(Ptr); 6269 return false; 6270 } 6271 6272 void LoopVectorizationCostModel::collectValuesToIgnore() { 6273 // Ignore ephemeral values. 6274 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6275 6276 // Ignore type-promoting instructions we identified during reduction 6277 // detection. 6278 for (auto &Reduction : *Legal->getReductionVars()) { 6279 RecurrenceDescriptor &RedDes = Reduction.second; 6280 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6281 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6282 } 6283 // Ignore type-casting instructions we identified during induction 6284 // detection. 6285 for (auto &Induction : *Legal->getInductionVars()) { 6286 InductionDescriptor &IndDes = Induction.second; 6287 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6288 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6289 } 6290 } 6291 6292 // TODO: we could return a pair of values that specify the max VF and 6293 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6294 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6295 // doesn't have a cost model that can choose which plan to execute if 6296 // more than one is generated. 
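// For example (illustrative): with 256-bit wide vector registers and a widest
// scalar type of 32 bits in the loop, determineVPlanVF below returns
// 256 / 32 = 8.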
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    CM.InterleaveInfo.reset();
  }

  if (UserVF) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    buildVPlansWithVPRecipes(UserVF, UserVF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{UserVF, 0}};
  }

  unsigned MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF != 0 && "MaxVF is zero.");

  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF > 1)
      CM.collectInstsToScalarize(VF);
  }

  buildVPlansWithVPRecipes(1, MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (MaxVF == 1)
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  return CM.selectVectorizationFactor(MaxVF);
}

void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
}

void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPCallbackILV CallbackILV(ILV);

  VPTransformState State{BestVF, BestUF, LI,
                         DT, ILV.Builder, ILV.VectorLoopValueMap,
                         &ILV, CallbackILV};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop();
}

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  BasicBlock *Latch = OrigLoop->getLoopLatch();

  // We create new control-flow for the vectorized loop, so the original
  // condition will be dead after vectorization if it's only used by the
  // branch.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && Cmp->hasOneUse())
    DeadInstructions.insert(Cmp);

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  for (auto &Induction : *Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
                                 DeadInstructions.end();
        }))
      DeadInstructions.insert(IndUpdate);

    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting
    // chain will get its scalar/vector/widened def from the
    // scalar/vector/widened def of the respective phi node.
Any other casts in the induction def-use chain 6465 // have no other uses outside the phi update chain, and will be ignored. 6466 InductionDescriptor &IndDes = Induction.second; 6467 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6468 DeadInstructions.insert(Casts.begin(), Casts.end()); 6469 } 6470 } 6471 6472 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6473 6474 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6475 6476 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6477 Instruction::BinaryOps BinOp) { 6478 // When unrolling and the VF is 1, we only need to add a simple scalar. 6479 Type *Ty = Val->getType(); 6480 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6481 6482 if (Ty->isFloatingPointTy()) { 6483 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6484 6485 // Floating point operations had to be 'fast' to enable the unrolling. 6486 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6487 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6488 } 6489 Constant *C = ConstantInt::get(Ty, StartIdx); 6490 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6491 } 6492 6493 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6494 SmallVector<Metadata *, 4> MDs; 6495 // Reserve first location for self reference to the LoopID metadata node. 6496 MDs.push_back(nullptr); 6497 bool IsUnrollMetadata = false; 6498 MDNode *LoopID = L->getLoopID(); 6499 if (LoopID) { 6500 // First find existing loop unrolling disable metadata. 6501 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6502 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6503 if (MD) { 6504 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6505 IsUnrollMetadata = 6506 S && S->getString().startswith("llvm.loop.unroll.disable"); 6507 } 6508 MDs.push_back(LoopID->getOperand(i)); 6509 } 6510 } 6511 6512 if (!IsUnrollMetadata) { 6513 // Add runtime unroll disable metadata. 6514 LLVMContext &Context = L->getHeader()->getContext(); 6515 SmallVector<Metadata *, 1> DisableOperands; 6516 DisableOperands.push_back( 6517 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6518 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6519 MDs.push_back(DisableNode); 6520 MDNode *NewLoopID = MDNode::get(Context, MDs); 6521 // Set operand 0 to refer to the loop id itself. 6522 NewLoopID->replaceOperandWith(0, NewLoopID); 6523 L->setLoopID(NewLoopID); 6524 } 6525 } 6526 6527 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6528 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6529 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6530 bool PredicateAtRangeStart = Predicate(Range.Start); 6531 6532 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6533 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6534 Range.End = TmpVF; 6535 break; 6536 } 6537 6538 return PredicateAtRangeStart; 6539 } 6540 6541 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6542 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6543 /// of VF's starting at a given VF and extending it as much as possible. Each 6544 /// vectorization decision can potentially shorten this sub-range during 6545 /// buildVPlan(). 
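/// For example (illustrative): with MinVF = 1 and MaxVF = 8, the loop below
/// might build VPlans covering the sub-ranges {1}, {2, 4} and {8}, with the
/// exact split depending on where the recipe-construction decisions change
/// across the VF range.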
6546 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6547 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6548 VFRange SubRange = {VF, MaxVF + 1}; 6549 VPlans.push_back(buildVPlan(SubRange)); 6550 VF = SubRange.End; 6551 } 6552 } 6553 6554 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6555 VPlanPtr &Plan) { 6556 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6557 6558 // Look for cached value. 6559 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6560 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6561 if (ECEntryIt != EdgeMaskCache.end()) 6562 return ECEntryIt->second; 6563 6564 VPValue *SrcMask = createBlockInMask(Src, Plan); 6565 6566 // The terminator has to be a branch inst! 6567 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6568 assert(BI && "Unexpected terminator found"); 6569 6570 if (!BI->isConditional()) 6571 return EdgeMaskCache[Edge] = SrcMask; 6572 6573 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6574 assert(EdgeMask && "No Edge Mask found for condition"); 6575 6576 if (BI->getSuccessor(0) != Dst) 6577 EdgeMask = Builder.createNot(EdgeMask); 6578 6579 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6580 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6581 6582 return EdgeMaskCache[Edge] = EdgeMask; 6583 } 6584 6585 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6586 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6587 6588 // Look for cached value. 6589 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6590 if (BCEntryIt != BlockMaskCache.end()) 6591 return BCEntryIt->second; 6592 6593 // All-one mask is modelled as no-mask following the convention for masked 6594 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6595 VPValue *BlockMask = nullptr; 6596 6597 if (OrigLoop->getHeader() == BB) { 6598 if (!CM.blockNeedsPredication(BB)) 6599 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6600 6601 // Introduce the early-exit compare IV <= BTC to form header block mask. 6602 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6603 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6604 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6605 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6606 return BlockMaskCache[BB] = BlockMask; 6607 } 6608 6609 // This is the block mask. We OR all incoming edges. 6610 for (auto *Predecessor : predecessors(BB)) { 6611 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6612 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6613 return BlockMaskCache[BB] = EdgeMask; 6614 6615 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6616 BlockMask = EdgeMask; 6617 continue; 6618 } 6619 6620 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6621 } 6622 6623 return BlockMaskCache[BB] = BlockMask; 6624 } 6625 6626 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, 6627 VFRange &Range, 6628 VPlanPtr &Plan) { 6629 const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); 6630 if (!IG) 6631 return nullptr; 6632 6633 // Now check if IG is relevant for VF's in the given range. 
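  // (It is relevant only for those VFs where the cost model decided to
  // interleave this member; VF == 1 never qualifies. The queried VF range is
  // clamped to the prefix of VFs that agree with the decision at Range.Start.)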
6634 auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { 6635 return [=](unsigned VF) -> bool { 6636 return (VF >= 2 && // Query is illegal for VF == 1 6637 CM.getWideningDecision(I, VF) == 6638 LoopVectorizationCostModel::CM_Interleave); 6639 }; 6640 }; 6641 if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) 6642 return nullptr; 6643 6644 // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) 6645 // range. If it's the primary member of the IG construct a VPInterleaveRecipe. 6646 // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. 6647 assert(I == IG->getInsertPos() && 6648 "Generating a recipe for an adjunct member of an interleave group"); 6649 6650 VPValue *Mask = nullptr; 6651 if (Legal->isMaskRequired(I)) 6652 Mask = createBlockInMask(I->getParent(), Plan); 6653 6654 return new VPInterleaveRecipe(IG, Mask); 6655 } 6656 6657 VPWidenMemoryInstructionRecipe * 6658 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6659 VPlanPtr &Plan) { 6660 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6661 return nullptr; 6662 6663 auto willWiden = [&](unsigned VF) -> bool { 6664 if (VF == 1) 6665 return false; 6666 if (CM.isScalarAfterVectorization(I, VF) || 6667 CM.isProfitableToScalarize(I, VF)) 6668 return false; 6669 LoopVectorizationCostModel::InstWidening Decision = 6670 CM.getWideningDecision(I, VF); 6671 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6672 "CM decision should be taken at this point."); 6673 assert(Decision != LoopVectorizationCostModel::CM_Interleave && 6674 "Interleave memory opportunity should be caught earlier."); 6675 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6676 }; 6677 6678 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6679 return nullptr; 6680 6681 VPValue *Mask = nullptr; 6682 if (Legal->isMaskRequired(I)) 6683 Mask = createBlockInMask(I->getParent(), Plan); 6684 6685 return new VPWidenMemoryInstructionRecipe(*I, Mask); 6686 } 6687 6688 VPWidenIntOrFpInductionRecipe * 6689 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6690 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6691 // Check if this is an integer or fp induction. If so, build the recipe that 6692 // produces its scalar and vector values. 6693 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); 6694 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6695 II.getKind() == InductionDescriptor::IK_FpInduction) 6696 return new VPWidenIntOrFpInductionRecipe(Phi); 6697 6698 return nullptr; 6699 } 6700 6701 // Optimize the special case where the source is a constant integer 6702 // induction variable. Notice that we can only optimize the 'trunc' case 6703 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6704 // (c) other casts depend on pointer size. 6705 6706 // Determine whether \p K is a truncation based on an induction variable that 6707 // can be optimized. 
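  // For example (illustrative): 'trunc i64 %iv to i32', where %iv is an
  // induction with a constant step, can be handled by producing the induction
  // directly in the narrower type rather than widening it and truncating the
  // result.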
6708 auto isOptimizableIVTruncate = 6709 [&](Instruction *K) -> std::function<bool(unsigned)> { 6710 return 6711 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6712 }; 6713 6714 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6715 isOptimizableIVTruncate(I), Range)) 6716 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6717 cast<TruncInst>(I)); 6718 return nullptr; 6719 } 6720 6721 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6722 PHINode *Phi = dyn_cast<PHINode>(I); 6723 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6724 return nullptr; 6725 6726 // We know that all PHIs in non-header blocks are converted into selects, so 6727 // we don't have to worry about the insertion order and we can just use the 6728 // builder. At this point we generate the predication tree. There may be 6729 // duplications since this is a simple recursive scan, but future 6730 // optimizations will clean it up. 6731 6732 SmallVector<VPValue *, 2> Masks; 6733 unsigned NumIncoming = Phi->getNumIncomingValues(); 6734 for (unsigned In = 0; In < NumIncoming; In++) { 6735 VPValue *EdgeMask = 6736 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6737 assert((EdgeMask || NumIncoming == 1) && 6738 "Multiple predecessors with one having a full mask"); 6739 if (EdgeMask) 6740 Masks.push_back(EdgeMask); 6741 } 6742 return new VPBlendRecipe(Phi, Masks); 6743 } 6744 6745 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6746 VFRange &Range) { 6747 6748 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6749 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6750 6751 if (IsPredicated) 6752 return false; 6753 6754 auto IsVectorizableOpcode = [](unsigned Opcode) { 6755 switch (Opcode) { 6756 case Instruction::Add: 6757 case Instruction::And: 6758 case Instruction::AShr: 6759 case Instruction::BitCast: 6760 case Instruction::Br: 6761 case Instruction::Call: 6762 case Instruction::FAdd: 6763 case Instruction::FCmp: 6764 case Instruction::FDiv: 6765 case Instruction::FMul: 6766 case Instruction::FNeg: 6767 case Instruction::FPExt: 6768 case Instruction::FPToSI: 6769 case Instruction::FPToUI: 6770 case Instruction::FPTrunc: 6771 case Instruction::FRem: 6772 case Instruction::FSub: 6773 case Instruction::GetElementPtr: 6774 case Instruction::ICmp: 6775 case Instruction::IntToPtr: 6776 case Instruction::Load: 6777 case Instruction::LShr: 6778 case Instruction::Mul: 6779 case Instruction::Or: 6780 case Instruction::PHI: 6781 case Instruction::PtrToInt: 6782 case Instruction::SDiv: 6783 case Instruction::Select: 6784 case Instruction::SExt: 6785 case Instruction::Shl: 6786 case Instruction::SIToFP: 6787 case Instruction::SRem: 6788 case Instruction::Store: 6789 case Instruction::Sub: 6790 case Instruction::Trunc: 6791 case Instruction::UDiv: 6792 case Instruction::UIToFP: 6793 case Instruction::URem: 6794 case Instruction::Xor: 6795 case Instruction::ZExt: 6796 return true; 6797 } 6798 return false; 6799 }; 6800 6801 if (!IsVectorizableOpcode(I->getOpcode())) 6802 return false; 6803 6804 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6805 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6806 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6807 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6808 return false; 6809 } 6810 6811 auto willWiden = [&](unsigned VF) -> bool { 6812 if (!isa<PHINode>(I) && 
(CM.isScalarAfterVectorization(I, VF) || 6813 CM.isProfitableToScalarize(I, VF))) 6814 return false; 6815 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6816 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6817 // The following case may be scalarized depending on the VF. 6818 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6819 // version of the instruction. 6820 // Is it beneficial to perform intrinsic call compared to lib call? 6821 bool NeedToScalarize; 6822 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6823 bool UseVectorIntrinsic = 6824 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6825 return UseVectorIntrinsic || !NeedToScalarize; 6826 } 6827 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6828 assert(CM.getWideningDecision(I, VF) == 6829 LoopVectorizationCostModel::CM_Scalarize && 6830 "Memory widening decisions should have been taken care by now"); 6831 return false; 6832 } 6833 return true; 6834 }; 6835 6836 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6837 return false; 6838 6839 // Success: widen this instruction. We optimize the common case where 6840 // consecutive instructions can be represented by a single recipe. 6841 if (!VPBB->empty()) { 6842 VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); 6843 if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) 6844 return true; 6845 } 6846 6847 VPBB->appendRecipe(new VPWidenRecipe(I)); 6848 return true; 6849 } 6850 6851 VPBasicBlock *VPRecipeBuilder::handleReplication( 6852 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 6853 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 6854 VPlanPtr &Plan) { 6855 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 6856 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 6857 Range); 6858 6859 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6860 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6861 6862 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 6863 6864 // Find if I uses a predicated instruction. If so, it will use its scalar 6865 // value. Avoid hoisting the insert-element which packs the scalar value into 6866 // a vector value, as that happens iff all users use the vector value. 6867 for (auto &Op : I->operands()) 6868 if (auto *PredInst = dyn_cast<Instruction>(Op)) 6869 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 6870 PredInst2Recipe[PredInst]->setAlsoPack(false); 6871 6872 // Finalize the recipe for Instr, first if it is not predicated. 6873 if (!IsPredicated) { 6874 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 6875 VPBB->appendRecipe(Recipe); 6876 return VPBB; 6877 } 6878 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 6879 assert(VPBB->getSuccessors().empty() && 6880 "VPBB has successors when handling predicated replication."); 6881 // Record predicated instructions for above packing optimizations. 
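  // (The replicate recipe created above is then wrapped in a single-entry,
  // single-exit if-then region by createReplicateRegion below, so that each
  // scalar copy only executes for lanes whose mask bit is set.)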
6882 PredInst2Recipe[I] = Recipe; 6883 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 6884 VPBlockUtils::insertBlockAfter(Region, VPBB); 6885 auto *RegSucc = new VPBasicBlock(); 6886 VPBlockUtils::insertBlockAfter(RegSucc, Region); 6887 return RegSucc; 6888 } 6889 6890 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 6891 VPRecipeBase *PredRecipe, 6892 VPlanPtr &Plan) { 6893 // Instructions marked for predication are replicated and placed under an 6894 // if-then construct to prevent side-effects. 6895 6896 // Generate recipes to compute the block mask for this region. 6897 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 6898 6899 // Build the triangular if-then region. 6900 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 6901 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 6902 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 6903 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 6904 auto *PHIRecipe = 6905 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 6906 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 6907 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 6908 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 6909 6910 // Note: first set Entry as region entry and then connect successors starting 6911 // from it in order, to propagate the "parent" of each VPBasicBlock. 6912 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 6913 VPBlockUtils::connectBlocks(Pred, Exit); 6914 6915 return Region; 6916 } 6917 6918 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 6919 VPlanPtr &Plan, VPBasicBlock *VPBB) { 6920 VPRecipeBase *Recipe = nullptr; 6921 // Check if Instr should belong to an interleave memory recipe, or already 6922 // does. In the latter case Instr is irrelevant. 6923 if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { 6924 VPBB->appendRecipe(Recipe); 6925 return true; 6926 } 6927 6928 // Check if Instr is a memory operation that should be widened. 6929 if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { 6930 VPBB->appendRecipe(Recipe); 6931 return true; 6932 } 6933 6934 // Check if Instr should form some PHI recipe. 6935 if ((Recipe = tryToOptimizeInduction(Instr, Range))) { 6936 VPBB->appendRecipe(Recipe); 6937 return true; 6938 } 6939 if ((Recipe = tryToBlend(Instr, Plan))) { 6940 VPBB->appendRecipe(Recipe); 6941 return true; 6942 } 6943 if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { 6944 VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); 6945 return true; 6946 } 6947 6948 // Check if Instr is to be widened by a general VPWidenRecipe, after 6949 // having first checked for specific widening recipes that deal with 6950 // Interleave Groups, Inductions and Phi nodes. 6951 if (tryToWiden(Instr, VPBB, Range)) 6952 return true; 6953 6954 return false; 6955 } 6956 6957 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 6958 unsigned MaxVF) { 6959 assert(OrigLoop->empty() && "Inner loop expected."); 6960 6961 // Collect conditions feeding internal conditional branches; they need to be 6962 // represented in VPlan for it to model masking. 
6963 SmallPtrSet<Value *, 1> NeedDef; 6964 6965 auto *Latch = OrigLoop->getLoopLatch(); 6966 for (BasicBlock *BB : OrigLoop->blocks()) { 6967 if (BB == Latch) 6968 continue; 6969 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 6970 if (Branch && Branch->isConditional()) 6971 NeedDef.insert(Branch->getCondition()); 6972 } 6973 6974 // If the tail is to be folded by masking, the primary induction variable 6975 // needs to be represented in VPlan for it to model early-exit masking. 6976 // Also, both the Phi and the live-out instruction of each reduction are 6977 // required in order to introduce a select between them in VPlan. 6978 if (CM.foldTailByMasking()) { 6979 NeedDef.insert(Legal->getPrimaryInduction()); 6980 for (auto &Reduction : *Legal->getReductionVars()) { 6981 NeedDef.insert(Reduction.first); 6982 NeedDef.insert(Reduction.second.getLoopExitInstr()); 6983 } 6984 } 6985 6986 // Collect instructions from the original loop that will become trivially dead 6987 // in the vectorized loop. We don't need to vectorize these instructions. For 6988 // example, original induction update instructions can become dead because we 6989 // separately emit induction "steps" when generating code for the new loop. 6990 // Similarly, we create a new latch condition when setting up the structure 6991 // of the new loop, so the old one can become dead. 6992 SmallPtrSet<Instruction *, 4> DeadInstructions; 6993 collectTriviallyDeadInstructions(DeadInstructions); 6994 6995 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6996 VFRange SubRange = {VF, MaxVF + 1}; 6997 VPlans.push_back( 6998 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); 6999 VF = SubRange.End; 7000 } 7001 } 7002 7003 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7004 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7005 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7006 // Hold a mapping from predicated instructions to their recipes, in order to 7007 // fix their AlsoPack behavior if a user is determined to replicate and use a 7008 // scalar instead of vector value. 7009 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7010 7011 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7012 DenseMap<Instruction *, Instruction *> SinkAfterInverse; 7013 7014 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7015 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7016 auto Plan = std::make_unique<VPlan>(VPBB); 7017 7018 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); 7019 // Represent values that will have defs inside VPlan. 7020 for (Value *V : NeedDef) 7021 Plan->addVPValue(V); 7022 7023 // Scan the body of the loop in a topological order to visit each basic block 7024 // after having visited its predecessor basic blocks. 7025 LoopBlocksDFS DFS(OrigLoop); 7026 DFS.perform(LI); 7027 7028 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7029 // Relevant instructions from basic block BB will be grouped into VPRecipe 7030 // ingredients and fill a new VPBasicBlock. 7031 unsigned VPBBsForBB = 0; 7032 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7033 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7034 VPBB = FirstVPBBForBB; 7035 Builder.setInsertPoint(VPBB); 7036 7037 std::vector<Instruction *> Ingredients; 7038 7039 // Organize the ingredients to vectorize from current basic block in the 7040 // right order. 
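    // "Right order" here mostly means honoring Legal->getSinkAfter(): an
    // instruction involved in a first-order recurrence is held back when first
    // encountered (step 1 below) and re-inserted right after the instruction
    // it has to follow (step 2 below), instead of in program order.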
7041 for (Instruction &I : BB->instructionsWithoutDebug()) { 7042 Instruction *Instr = &I; 7043 7044 // First filter out irrelevant instructions, to ensure no recipes are 7045 // built for them. 7046 if (isa<BranchInst>(Instr) || 7047 DeadInstructions.find(Instr) != DeadInstructions.end()) 7048 continue; 7049 7050 // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct 7051 // member of the IG, do not construct any Recipe for it. 7052 const InterleaveGroup<Instruction> *IG = 7053 CM.getInterleavedAccessGroup(Instr); 7054 if (IG && Instr != IG->getInsertPos() && 7055 Range.Start >= 2 && // Query is illegal for VF == 1 7056 CM.getWideningDecision(Instr, Range.Start) == 7057 LoopVectorizationCostModel::CM_Interleave) { 7058 auto SinkCandidate = SinkAfterInverse.find(Instr); 7059 if (SinkCandidate != SinkAfterInverse.end()) 7060 Ingredients.push_back(SinkCandidate->second); 7061 continue; 7062 } 7063 7064 // Move instructions to handle first-order recurrences, step 1: avoid 7065 // handling this instruction until after we've handled the instruction it 7066 // should follow. 7067 auto SAIt = SinkAfter.find(Instr); 7068 if (SAIt != SinkAfter.end()) { 7069 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" 7070 << *SAIt->second 7071 << " to vectorize a 1st order recurrence.\n"); 7072 SinkAfterInverse[SAIt->second] = Instr; 7073 continue; 7074 } 7075 7076 Ingredients.push_back(Instr); 7077 7078 // Move instructions to handle first-order recurrences, step 2: push the 7079 // instruction to be sunk at its insertion point. 7080 auto SAInvIt = SinkAfterInverse.find(Instr); 7081 if (SAInvIt != SinkAfterInverse.end()) 7082 Ingredients.push_back(SAInvIt->second); 7083 } 7084 7085 // Introduce each ingredient into VPlan. 7086 for (Instruction *Instr : Ingredients) { 7087 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7088 continue; 7089 7090 // Otherwise, if all widening options failed, Instruction is to be 7091 // replicated. This may create a successor for VPBB. 7092 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7093 Instr, Range, VPBB, PredInst2Recipe, Plan); 7094 if (NextVPBB != VPBB) { 7095 VPBB = NextVPBB; 7096 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7097 : ""); 7098 } 7099 } 7100 } 7101 7102 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7103 // may also be empty, such as the last one VPBB, reflecting original 7104 // basic-blocks with no recipes. 7105 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7106 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7107 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7108 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7109 delete PreEntry; 7110 7111 // Finally, if tail is folded by masking, introduce selects between the phi 7112 // and the live-out instruction of each reduction, at the end of the latch. 
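  // Conceptually, each reduction gets, at the end of the latch:
  //   next = select(header-block-in-mask, updated-reduction-value, phi)
  // so lanes that are masked off by tail folding carry the previous
  // iteration's value instead of a partially-computed one.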
7113 if (CM.foldTailByMasking()) { 7114 Builder.setInsertPoint(VPBB); 7115 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7116 for (auto &Reduction : *Legal->getReductionVars()) { 7117 VPValue *Phi = Plan->getVPValue(Reduction.first); 7118 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7119 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7120 } 7121 } 7122 7123 std::string PlanName; 7124 raw_string_ostream RSO(PlanName); 7125 unsigned VF = Range.Start; 7126 Plan->addVF(VF); 7127 RSO << "Initial VPlan for VF={" << VF; 7128 for (VF *= 2; VF < Range.End; VF *= 2) { 7129 Plan->addVF(VF); 7130 RSO << "," << VF; 7131 } 7132 RSO << "},UF>=1"; 7133 RSO.flush(); 7134 Plan->setName(PlanName); 7135 7136 return Plan; 7137 } 7138 7139 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7140 // Outer loop handling: They may require CFG and instruction level 7141 // transformations before even evaluating whether vectorization is profitable. 7142 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7143 // the vectorization pipeline. 7144 assert(!OrigLoop->empty()); 7145 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7146 7147 // Create new empty VPlan 7148 auto Plan = std::make_unique<VPlan>(); 7149 7150 // Build hierarchical CFG 7151 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7152 HCFGBuilder.buildHierarchicalCFG(); 7153 7154 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7155 Plan->addVF(VF); 7156 7157 if (EnableVPlanPredication) { 7158 VPlanPredicator VPP(*Plan); 7159 VPP.predicate(); 7160 7161 // Avoid running transformation to recipes until masked code generation in 7162 // VPlan-native path is in place. 7163 return Plan; 7164 } 7165 7166 SmallPtrSet<Instruction *, 1> DeadInstructions; 7167 VPlanHCFGTransforms::VPInstructionsToVPRecipes( 7168 Plan, Legal->getInductionVars(), DeadInstructions); 7169 7170 return Plan; 7171 } 7172 7173 Value* LoopVectorizationPlanner::VPCallbackILV:: 7174 getOrCreateVectorValues(Value *V, unsigned Part) { 7175 return ILV.getOrCreateVectorValue(V, Part); 7176 } 7177 7178 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { 7179 O << " +\n" 7180 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7181 IG->getInsertPos()->printAsOperand(O, false); 7182 if (User) { 7183 O << ", "; 7184 User->getOperand(0)->printAsOperand(O); 7185 } 7186 O << "\\l\""; 7187 for (unsigned i = 0; i < IG->getFactor(); ++i) 7188 if (Instruction *I = IG->getMember(i)) 7189 O << " +\n" 7190 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7191 } 7192 7193 void VPWidenRecipe::execute(VPTransformState &State) { 7194 for (auto &Instr : make_range(Begin, End)) 7195 State.ILV->widenInstruction(Instr); 7196 } 7197 7198 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7199 assert(!State.Instance && "Int or FP induction being replicated."); 7200 State.ILV->widenIntOrFpInduction(IV, Trunc); 7201 } 7202 7203 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7204 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7205 } 7206 7207 void VPBlendRecipe::execute(VPTransformState &State) { 7208 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7209 // We know that all PHIs in non-header blocks are converted into 7210 // selects, so we don't have to worry about the insertion order and we 7211 // can just use the builder. 7212 // At this point we generate the predication tree. 
  // There may be duplications since this is a simple recursive scan, but
  // future optimizations will clean it up.

  unsigned NumIncoming = Phi->getNumIncomingValues();

  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               ( ...)))
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 =
          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  if (!User)
    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
7292 ConditionBit = State.Builder.getTrue(); 7293 else { 7294 VPValue *BlockInMask = User->getOperand(0); 7295 ConditionBit = State.get(BlockInMask, Part); 7296 if (ConditionBit->getType()->isVectorTy()) 7297 ConditionBit = State.Builder.CreateExtractElement( 7298 ConditionBit, State.Builder.getInt32(Lane)); 7299 } 7300 7301 // Replace the temporary unreachable terminator with a new conditional branch, 7302 // whose two destinations will be set later when they are created. 7303 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7304 assert(isa<UnreachableInst>(CurrentTerminator) && 7305 "Expected to replace unreachable terminator with conditional branch."); 7306 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7307 CondBr->setSuccessor(0, nullptr); 7308 ReplaceInstWithInst(CurrentTerminator, CondBr); 7309 } 7310 7311 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7312 assert(State.Instance && "Predicated instruction PHI works per instance."); 7313 Instruction *ScalarPredInst = cast<Instruction>( 7314 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7315 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7316 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7317 assert(PredicatingBB && "Predicated block has no single predecessor."); 7318 7319 // By current pack/unpack logic we need to generate only a single phi node: if 7320 // a vector value for the predicated instruction exists at this point it means 7321 // the instruction has vector users only, and a phi for the vector value is 7322 // needed. In this case the recipe of the predicated instruction is marked to 7323 // also do that packing, thereby "hoisting" the insert-element sequence. 7324 // Otherwise, a phi node for the scalar value is needed. 7325 unsigned Part = State.Instance->Part; 7326 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7327 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7328 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7329 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7330 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7331 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7332 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7333 } else { 7334 Type *PredInstType = PredInst->getType(); 7335 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7336 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7337 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7338 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7339 } 7340 } 7341 7342 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7343 if (!User) 7344 return State.ILV->vectorizeMemoryInstruction(&Instr); 7345 7346 // Last (and currently only) operand is a mask. 
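  // The mask is a single VPValue in the plan, but it materializes as one
  // concrete vector per unroll part, so gather State.UF mask values before
  // handing them to the ILV.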
7347 InnerLoopVectorizer::VectorParts MaskValues(State.UF); 7348 VPValue *Mask = User->getOperand(User->getNumOperands() - 1); 7349 for (unsigned Part = 0; Part < State.UF; ++Part) 7350 MaskValues[Part] = State.get(Mask, Part); 7351 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); 7352 } 7353 7354 static ScalarEpilogueLowering 7355 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, 7356 ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { 7357 ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; 7358 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && 7359 (F->hasOptSize() || 7360 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) 7361 SEL = CM_ScalarEpilogueNotAllowedOptSize; 7362 else if (PreferPredicateOverEpilog || Hints.getPredicate()) 7363 SEL = CM_ScalarEpilogueNotNeededUsePredicate; 7364 7365 return SEL; 7366 } 7367 7368 // Process the loop in the VPlan-native vectorization path. This path builds 7369 // VPlan upfront in the vectorization pipeline, which allows to apply 7370 // VPlan-to-VPlan transformations from the very beginning without modifying the 7371 // input LLVM IR. 7372 static bool processLoopInVPlanNativePath( 7373 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7374 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7375 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7376 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7377 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7378 7379 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7380 Function *F = L->getHeader()->getParent(); 7381 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7382 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); 7383 7384 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7385 &Hints, IAI); 7386 // Use the planner for outer loop vectorization. 7387 // TODO: CM is not used at this point inside the planner. Turn CM into an 7388 // optional argument if we don't need it in the future. 7389 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); 7390 7391 // Get user vectorization factor. 7392 const unsigned UserVF = Hints.getWidth(); 7393 7394 // Plan how to best vectorize, return the best VF and its cost. 7395 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7396 7397 // If we are stress testing VPlan builds, do not attempt to generate vector 7398 // code. Masked vector code generation support will follow soon. 7399 // Also, do not attempt to vectorize if no vector code will be produced. 7400 if (VPlanBuildStressTest || EnableVPlanPredication || 7401 VectorizationFactor::Disabled() == VF) 7402 return false; 7403 7404 LVP.setBestPlan(VF.Width, 1); 7405 7406 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7407 &CM); 7408 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7409 << L->getHeader()->getParent()->getName() << "\"\n"); 7410 LVP.executePlan(LB, DT); 7411 7412 // Mark the loop as already vectorized to avoid vectorizing again. 7413 Hints.setAlreadyVectorized(); 7414 7415 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7416 return true; 7417 } 7418 7419 bool LoopVectorizePass::processLoop(Loop *L) { 7420 assert((EnableVPlanNativePath || L->empty()) && 7421 "VPlan-native path is not enabled. 
Only process inner loops."); 7422 7423 #ifndef NDEBUG 7424 const std::string DebugLocStr = getDebugLocString(L); 7425 #endif /* NDEBUG */ 7426 7427 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7428 << L->getHeader()->getParent()->getName() << "\" from " 7429 << DebugLocStr << "\n"); 7430 7431 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7432 7433 LLVM_DEBUG( 7434 dbgs() << "LV: Loop hints:" 7435 << " force=" 7436 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7437 ? "disabled" 7438 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7439 ? "enabled" 7440 : "?")) 7441 << " width=" << Hints.getWidth() 7442 << " unroll=" << Hints.getInterleave() << "\n"); 7443 7444 // Function containing loop 7445 Function *F = L->getHeader()->getParent(); 7446 7447 // Looking at the diagnostic output is the only way to determine if a loop 7448 // was vectorized (other than looking at the IR or machine code), so it 7449 // is important to generate an optimization remark for each loop. Most of 7450 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7451 // generated as OptimizationRemark and OptimizationRemarkMissed are 7452 // less verbose reporting vectorized loops and unvectorized loops that may 7453 // benefit from vectorization, respectively. 7454 7455 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7456 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7457 return false; 7458 } 7459 7460 PredicatedScalarEvolution PSE(*SE, *L); 7461 7462 // Check if it is legal to vectorize the loop. 7463 LoopVectorizationRequirements Requirements(*ORE); 7464 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7465 &Requirements, &Hints, DB, AC); 7466 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7467 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7468 Hints.emitRemarkWithHints(); 7469 return false; 7470 } 7471 7472 // Check the function attributes and profiles to find out if this function 7473 // should be optimized for size. 7474 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); 7475 7476 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7477 // here. They may require CFG and instruction level transformations before 7478 // even evaluating whether vectorization is profitable. Since we cannot modify 7479 // the incoming IR, we need to build VPlan upfront in the vectorization 7480 // pipeline. 7481 if (!L->empty()) 7482 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7483 ORE, BFI, PSI, Hints); 7484 7485 assert(L->empty() && "Inner loop expected."); 7486 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7487 // count by optimizing for size, to minimize overheads. 7488 // Prefer constant trip counts over profile data, over upper bound estimate. 7489 unsigned ExpectedTC = 0; 7490 bool HasExpectedTC = false; 7491 if (const SCEVConstant *ConstExits = 7492 dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) { 7493 const APInt &ExitsCount = ConstExits->getAPInt(); 7494 // We are interested in small values for ExpectedTC. Skip over those that 7495 // can't fit an unsigned. 7496 if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) { 7497 ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1; 7498 HasExpectedTC = true; 7499 } 7500 } 7501 // ExpectedTC may be large because it's bound by a variable. Check 7502 // profiling information to validate we should vectorize. 
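  // (The fallbacks below follow the preference order stated above: constant
  // backedge-taken count first, then the profile-based estimate, and finally
  // SCEV's small constant maximum trip count.)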
7503 if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { 7504 auto EstimatedTC = getLoopEstimatedTripCount(L); 7505 if (EstimatedTC) { 7506 ExpectedTC = *EstimatedTC; 7507 HasExpectedTC = true; 7508 } 7509 } 7510 if (!HasExpectedTC) { 7511 ExpectedTC = SE->getSmallConstantMaxTripCount(L); 7512 HasExpectedTC = (ExpectedTC > 0); 7513 } 7514 7515 if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { 7516 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7517 << "This loop is worth vectorizing only if no scalar " 7518 << "iteration overheads are incurred."); 7519 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7520 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7521 else { 7522 LLVM_DEBUG(dbgs() << "\n"); 7523 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7524 } 7525 } 7526 7527 // Check the function attributes to see if implicit floats are allowed. 7528 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7529 // an integer loop and the vector instructions selected are purely integer 7530 // vector instructions? 7531 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7532 reportVectorizationFailure( 7533 "Can't vectorize when the NoImplicitFloat attribute is used", 7534 "loop not vectorized due to NoImplicitFloat attribute", 7535 "NoImplicitFloat", ORE, L); 7536 Hints.emitRemarkWithHints(); 7537 return false; 7538 } 7539 7540 // Check if the target supports potentially unsafe FP vectorization. 7541 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7542 // for the target we're vectorizing for, to make sure none of the 7543 // additional fp-math flags can help. 7544 if (Hints.isPotentiallyUnsafe() && 7545 TTI->isFPVectorizationPotentiallyUnsafe()) { 7546 reportVectorizationFailure( 7547 "Potentially unsafe FP op prevents vectorization", 7548 "loop not vectorized due to unsafe FP support.", 7549 "UnsafeFP", ORE, L); 7550 Hints.emitRemarkWithHints(); 7551 return false; 7552 } 7553 7554 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7555 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7556 7557 // If an override option has been passed in for interleaved accesses, use it. 7558 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7559 UseInterleaved = EnableInterleavedMemAccesses; 7560 7561 // Analyze interleaved memory accesses. 7562 if (UseInterleaved) { 7563 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7564 } 7565 7566 // Use the cost model. 7567 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7568 F, &Hints, IAI); 7569 CM.collectValuesToIgnore(); 7570 7571 // Use the planner for vectorization. 7572 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); 7573 7574 // Get user vectorization factor. 7575 unsigned UserVF = Hints.getWidth(); 7576 7577 // Plan how to best vectorize, return the best VF and its cost. 7578 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7579 7580 VectorizationFactor VF = VectorizationFactor::Disabled(); 7581 unsigned IC = 1; 7582 unsigned UserIC = Hints.getInterleave(); 7583 7584 if (MaybeVF) { 7585 VF = *MaybeVF; 7586 // Select the interleave count. 7587 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7588 } 7589 7590 // Identify the diagnostic messages that should be produced. 
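  // Each message is an (identifier, text) pair; emission is deferred until
  // both the vectorization and the interleaving decisions are final, further
  // below.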
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
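    // Emit both missed-optimization remarks so that, e.g.,
    // -Rpass-missed=loop-vectorize reports why neither transformation was
    // applied.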
7647 ORE->emit([&]() { 7648 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7649 L->getStartLoc(), L->getHeader()) 7650 << VecDiagMsg.second; 7651 }); 7652 ORE->emit([&]() { 7653 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7654 L->getStartLoc(), L->getHeader()) 7655 << IntDiagMsg.second; 7656 }); 7657 return false; 7658 } else if (!VectorizeLoop && InterleaveLoop) { 7659 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7660 ORE->emit([&]() { 7661 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7662 L->getStartLoc(), L->getHeader()) 7663 << VecDiagMsg.second; 7664 }); 7665 } else if (VectorizeLoop && !InterleaveLoop) { 7666 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7667 << ") in " << DebugLocStr << '\n'); 7668 ORE->emit([&]() { 7669 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7670 L->getStartLoc(), L->getHeader()) 7671 << IntDiagMsg.second; 7672 }); 7673 } else if (VectorizeLoop && InterleaveLoop) { 7674 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7675 << ") in " << DebugLocStr << '\n'); 7676 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7677 } 7678 7679 LVP.setBestPlan(VF.Width, IC); 7680 7681 using namespace ore; 7682 bool DisableRuntimeUnroll = false; 7683 MDNode *OrigLoopID = L->getLoopID(); 7684 7685 if (!VectorizeLoop) { 7686 assert(IC > 1 && "interleave count should not be 1 or 0"); 7687 // If we decided that it is not legal to vectorize the loop, then 7688 // interleave it. 7689 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7690 &CM); 7691 LVP.executePlan(Unroller, DT); 7692 7693 ORE->emit([&]() { 7694 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7695 L->getHeader()) 7696 << "interleaved loop (interleaved count: " 7697 << NV("InterleaveCount", IC) << ")"; 7698 }); 7699 } else { 7700 // If we decided that it is *legal* to vectorize the loop, then do it. 7701 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7702 &LVL, &CM); 7703 LVP.executePlan(LB, DT); 7704 ++LoopsVectorized; 7705 7706 // Add metadata to disable runtime unrolling a scalar loop when there are 7707 // no runtime checks about strides and memory. A scalar loop that is 7708 // rarely used is not worth unrolling. 7709 if (!LB.areSafetyChecksAdded()) 7710 DisableRuntimeUnroll = true; 7711 7712 // Report the vectorization decision. 7713 ORE->emit([&]() { 7714 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7715 L->getHeader()) 7716 << "vectorized loop (vectorization width: " 7717 << NV("VectorizationFactor", VF.Width) 7718 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7719 }); 7720 } 7721 7722 Optional<MDNode *> RemainderLoopID = 7723 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7724 LLVMLoopVectorizeFollowupEpilogue}); 7725 if (RemainderLoopID.hasValue()) { 7726 L->setLoopID(RemainderLoopID.getValue()); 7727 } else { 7728 if (DisableRuntimeUnroll) 7729 AddRuntimeUnrollDisableMetaData(L); 7730 7731 // Mark the loop as already vectorized to avoid vectorizing again. 
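    // (This tags the loop with metadata so that later runs of the vectorizer
    // skip it.)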
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ?
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 7818 : nullptr; 7819 7820 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 7821 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 7822 [&](Loop &L) -> const LoopAccessInfo & { 7823 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 7824 return LAM.getResult<LoopAccessAnalysis>(L, AR); 7825 }; 7826 const ModuleAnalysisManager &MAM = 7827 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 7828 ProfileSummaryInfo *PSI = 7829 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 7830 bool Changed = 7831 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 7832 if (!Changed) 7833 return PreservedAnalyses::all(); 7834 PreservedAnalyses PA; 7835 7836 // We currently do not preserve loopinfo/dominator analyses with outer loop 7837 // vectorization. Until this is addressed, mark these analyses as preserved 7838 // only for non-VPlan-native path. 7839 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 7840 if (!EnableVPlanNativePath) { 7841 PA.preserve<LoopAnalysis>(); 7842 PA.preserve<DominatorTreeAnalysis>(); 7843 } 7844 PA.preserve<BasicAA>(); 7845 PA.preserve<GlobalsAA>(); 7846 return PA; 7847 } 7848