//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
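
// As a simplified illustration of the widening transformation described above,
// a scalar loop such as (C-like pseudocode, assuming a vectorization factor of
// four and a trip count that is a multiple of four):
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is conceptually turned into a loop that processes four elements per
// iteration and steps the index by the SIMD vector width:
//
//   for (i = 0; i < n; i += 4)
//     a[i:i+3] = b[i:i+3] + 42;
//
// The actual transformation is performed on LLVM IR, and the cost model below
// decides whether, and with which vector width, it is profitable.
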
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanHCFGTransforms.h"
#include "VPlanPredicator.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and that predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

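// For testing and debugging, several of the overrides above can be supplied on
// the command line. A hypothetical invocation (using the legacy pass manager's
// -loop-vectorize pass; the flag values are illustrative only) might look like:
//
//   opt -loop-vectorize -force-target-instruction-cost=1 \
//       -force-target-max-vector-interleave=1 -S input.ll
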
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

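// For example (assuming a typical data layout where i1 is allocated in a full
// byte), i1 is irregular: an array of four i1 values occupies four bytes,
// whereas a <4 x i1> vector is stored in a single byte, so the two layouts are
// not bitcast compatible. Regular types such as i32 satisfy
// VF * alloc-size == vector store size for any VF.
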
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

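  // As an illustration of the on-demand packing described above (assuming
  // VF == 4 and i32 scalar values %s0..%s3), the generated insertelement
  // sequence would look roughly like:
  //
  //   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  //   %v1 = insertelement <4 x i32> %v0,   i32 %s1, i32 1
  //   %v2 = insertelement <4 x i32> %v1,   i32 %s2, i32 2
  //   %v3 = insertelement <4 x i32> %v2,   i32 %s3, i32 3
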
  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing, we only handled real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints about how the scalar epilogue loop
// should be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;

    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
0 : extract of last 1323 /// element) 1324 unsigned getUniformMemOpCost(Instruction *I, unsigned VF); 1325 1326 /// Estimate the overhead of scalarizing an instruction. This is a 1327 /// convenience wrapper for the type-based getScalarizationOverhead API. 1328 unsigned getScalarizationOverhead(Instruction *I, unsigned VF); 1329 1330 /// Returns whether the instruction is a load or store and will be a emitted 1331 /// as a vector operation. 1332 bool isConsecutiveLoadOrStore(Instruction *I); 1333 1334 /// Returns true if an artificially high cost for emulated masked memrefs 1335 /// should be used. 1336 bool useEmulatedMaskMemRefHack(Instruction *I); 1337 1338 /// Map of scalar integer values to the smallest bitwidth they can be legally 1339 /// represented as. The vector equivalents of these values should be truncated 1340 /// to this type. 1341 MapVector<Instruction *, uint64_t> MinBWs; 1342 1343 /// A type representing the costs for instructions if they were to be 1344 /// scalarized rather than vectorized. The entries are Instruction-Cost 1345 /// pairs. 1346 using ScalarCostsTy = DenseMap<Instruction *, unsigned>; 1347 1348 /// A set containing all BasicBlocks that are known to present after 1349 /// vectorization as a predicated block. 1350 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1351 1352 /// Records whether it is allowed to have the original scalar loop execute at 1353 /// least once. This may be needed as a fallback loop in case runtime 1354 /// aliasing/dependence checks fail, or to handle the tail/remainder 1355 /// iterations when the trip count is unknown or doesn't divide by the VF, 1356 /// or as a peel-loop to handle gaps in interleave-groups. 1357 /// Under optsize and when the trip count is very small we don't allow any 1358 /// iterations to execute in the scalar loop. 1359 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1360 1361 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1362 bool FoldTailByMasking = false; 1363 1364 /// A map holding scalar costs for different vectorization factors. The 1365 /// presence of a cost for an instruction in the mapping indicates that the 1366 /// instruction will be scalarized when vectorizing with the associated 1367 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1368 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; 1369 1370 /// Holds the instructions known to be uniform after vectorization. 1371 /// The data is collected per VF. 1372 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; 1373 1374 /// Holds the instructions known to be scalar after vectorization. 1375 /// The data is collected per VF. 1376 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1377 1378 /// Holds the instructions (address computations) that are forced to be 1379 /// scalarized. 1380 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1381 1382 /// Returns the expected difference in cost from scalarizing the expression 1383 /// feeding a predicated instruction \p PredInst. The instructions to 1384 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1385 /// non-negative return value implies the expression will be scalarized. 1386 /// Currently, only single-use chains are considered for scalarization. 1387 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1388 unsigned VF); 1389 1390 /// Collect the instructions that are uniform after vectorization. 
An 1391 /// instruction is uniform if we represent it with a single scalar value in 1392 /// the vectorized loop corresponding to each vector iteration. Examples of 1393 /// uniform instructions include pointer operands of consecutive or 1394 /// interleaved memory accesses. Note that although uniformity implies an 1395 /// instruction will be scalar, the reverse is not true. In general, a 1396 /// scalarized instruction will be represented by VF scalar values in the 1397 /// vectorized loop, each corresponding to an iteration of the original 1398 /// scalar loop. 1399 void collectLoopUniforms(unsigned VF); 1400 1401 /// Collect the instructions that are scalar after vectorization. An 1402 /// instruction is scalar if it is known to be uniform or will be scalarized 1403 /// during vectorization. Non-uniform scalarized instructions will be 1404 /// represented by VF values in the vectorized loop, each corresponding to an 1405 /// iteration of the original scalar loop. 1406 void collectLoopScalars(unsigned VF); 1407 1408 /// Keeps cost model vectorization decision and cost for instructions. 1409 /// Right now it is used for memory instructions only. 1410 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1411 std::pair<InstWidening, unsigned>>; 1412 1413 DecisionList WideningDecisions; 1414 1415 /// Returns true if \p V is expected to be vectorized and it needs to be 1416 /// extracted. 1417 bool needsExtract(Value *V, unsigned VF) const { 1418 Instruction *I = dyn_cast<Instruction>(V); 1419 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1420 return false; 1421 1422 // Assume we can vectorize V (and hence we need extraction) if the 1423 // scalars are not computed yet. This can happen, because it is called 1424 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1425 // the scalars are collected. That should be a safe assumption in most 1426 // cases, because we check if the operands have vectorizable types 1427 // beforehand in LoopVectorizationLegality. 1428 return Scalars.find(VF) == Scalars.end() || 1429 !isScalarAfterVectorization(I, VF); 1430 }; 1431 1432 /// Returns a range containing only operands needing to be extracted. 1433 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1434 unsigned VF) { 1435 return SmallVector<Value *, 4>(make_filter_range( 1436 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1437 } 1438 1439 public: 1440 /// The loop that we evaluate. 1441 Loop *TheLoop; 1442 1443 /// Predicated scalar evolution analysis. 1444 PredicatedScalarEvolution &PSE; 1445 1446 /// Loop Info analysis. 1447 LoopInfo *LI; 1448 1449 /// Vectorization legality. 1450 LoopVectorizationLegality *Legal; 1451 1452 /// Vector target information. 1453 const TargetTransformInfo &TTI; 1454 1455 /// Target Library Info. 1456 const TargetLibraryInfo *TLI; 1457 1458 /// Demanded bits analysis. 1459 DemandedBits *DB; 1460 1461 /// Assumption cache. 1462 AssumptionCache *AC; 1463 1464 /// Interface to emit optimization remarks. 1465 OptimizationRemarkEmitter *ORE; 1466 1467 const Function *TheFunction; 1468 1469 /// Loop Vectorize Hint. 1470 const LoopVectorizeHints *Hints; 1471 1472 /// The interleave access information contains groups of interleaved accesses 1473 /// with the same stride and close to each other. 1474 InterleavedAccessInfo &InterleaveInfo; 1475 1476 /// Values to ignore in the cost model. 
1477 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1478 1479 /// Values to ignore in the cost model when VF > 1. 1480 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1481 }; 1482 1483 } // end namespace llvm 1484 1485 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1486 // vectorization. The loop needs to be annotated with #pragma omp simd 1487 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1488 // vector length information is not provided, vectorization is not considered 1489 // explicit. Interleave hints are not allowed either. These limitations will be 1490 // relaxed in the future. 1491 // Please, note that we are currently forced to abuse the pragma 'clang 1492 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1493 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1494 // provides *explicit vectorization hints* (LV can bypass legal checks and 1495 // assume that vectorization is legal). However, both hints are implemented 1496 // using the same metadata (llvm.loop.vectorize, processed by 1497 // LoopVectorizeHints). This will be fixed in the future when the native IR 1498 // representation for pragma 'omp simd' is introduced. 1499 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1500 OptimizationRemarkEmitter *ORE) { 1501 assert(!OuterLp->empty() && "This is not an outer loop"); 1502 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1503 1504 // Only outer loops with an explicit vectorization hint are supported. 1505 // Unannotated outer loops are ignored. 1506 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1507 return false; 1508 1509 Function *Fn = OuterLp->getHeader()->getParent(); 1510 if (!Hints.allowVectorization(Fn, OuterLp, 1511 true /*VectorizeOnlyWhenForced*/)) { 1512 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1513 return false; 1514 } 1515 1516 if (Hints.getInterleave() > 1) { 1517 // TODO: Interleave support is future work. 1518 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1519 "outer loops.\n"); 1520 Hints.emitRemarkWithHints(); 1521 return false; 1522 } 1523 1524 return true; 1525 } 1526 1527 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1528 OptimizationRemarkEmitter *ORE, 1529 SmallVectorImpl<Loop *> &V) { 1530 // Collect inner loops and outer loops without irreducible control flow. For 1531 // now, only collect outer loops that have explicit vectorization hints. If we 1532 // are stress testing the VPlan H-CFG construction, we collect the outermost 1533 // loop of every loop nest. 1534 if (L.empty() || VPlanBuildStressTest || 1535 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1536 LoopBlocksRPO RPOT(&L); 1537 RPOT.perform(LI); 1538 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1539 V.push_back(&L); 1540 // TODO: Collect inner loops inside marked outer loops in case 1541 // vectorization fails for the outer loop. Do not invoke 1542 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1543 // already known to be reducible. We can use an inherited attribute for 1544 // that. 1545 return; 1546 } 1547 } 1548 for (Loop *InnerL : L) 1549 collectSupportedLoops(*InnerL, LI, ORE, V); 1550 } 1551 1552 namespace { 1553 1554 /// The LoopVectorize Pass. 
1555 struct LoopVectorize : public FunctionPass { 1556 /// Pass identification, replacement for typeid 1557 static char ID; 1558 1559 LoopVectorizePass Impl; 1560 1561 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1562 bool VectorizeOnlyWhenForced = false) 1563 : FunctionPass(ID) { 1564 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1565 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1566 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1567 } 1568 1569 bool runOnFunction(Function &F) override { 1570 if (skipFunction(F)) 1571 return false; 1572 1573 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1574 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1575 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1576 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1577 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1578 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1579 auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; 1580 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1581 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1582 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1583 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1584 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1585 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1586 1587 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1588 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1589 1590 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1591 GetLAA, *ORE, PSI); 1592 } 1593 1594 void getAnalysisUsage(AnalysisUsage &AU) const override { 1595 AU.addRequired<AssumptionCacheTracker>(); 1596 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1597 AU.addRequired<DominatorTreeWrapperPass>(); 1598 AU.addRequired<LoopInfoWrapperPass>(); 1599 AU.addRequired<ScalarEvolutionWrapperPass>(); 1600 AU.addRequired<TargetTransformInfoWrapperPass>(); 1601 AU.addRequired<AAResultsWrapperPass>(); 1602 AU.addRequired<LoopAccessLegacyAnalysis>(); 1603 AU.addRequired<DemandedBitsWrapperPass>(); 1604 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1605 1606 // We currently do not preserve loopinfo/dominator analyses with outer loop 1607 // vectorization. Until this is addressed, mark these analyses as preserved 1608 // only for non-VPlan-native path. 1609 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1610 if (!EnableVPlanNativePath) { 1611 AU.addPreserved<LoopInfoWrapperPass>(); 1612 AU.addPreserved<DominatorTreeWrapperPass>(); 1613 } 1614 1615 AU.addPreserved<BasicAAWrapperPass>(); 1616 AU.addPreserved<GlobalsAAWrapperPass>(); 1617 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1618 } 1619 }; 1620 1621 } // end anonymous namespace 1622 1623 //===----------------------------------------------------------------------===// 1624 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1625 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1626 //===----------------------------------------------------------------------===// 1627 1628 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1629 // We need to place the broadcast of invariant variables outside the loop, 1630 // but only if it's proven safe to do so. Else, broadcast will be inside 1631 // vector loop body. 
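  // For exposition only: with VF = 4 and an invariant i32 %v, the splat built
  // below is roughly the following (value names are illustrative, not
  // necessarily what the builder emits), hoisted when SafeToHoist holds:
  //
  //   vector.ph:
  //     %splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
  //     %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef,
  //                            <4 x i32> zeroinitializer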
1632 Instruction *Instr = dyn_cast<Instruction>(V); 1633 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1634 (!Instr || 1635 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1636 // Place the code for broadcasting invariant variables in the new preheader. 1637 IRBuilder<>::InsertPointGuard Guard(Builder); 1638 if (SafeToHoist) 1639 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1640 1641 // Broadcast the scalar into all locations in the vector. 1642 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1643 1644 return Shuf; 1645 } 1646 1647 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1648 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1649 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1650 "Expected either an induction phi-node or a truncate of it!"); 1651 Value *Start = II.getStartValue(); 1652 1653 // Construct the initial value of the vector IV in the vector loop preheader 1654 auto CurrIP = Builder.saveIP(); 1655 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1656 if (isa<TruncInst>(EntryVal)) { 1657 assert(Start->getType()->isIntegerTy() && 1658 "Truncation requires an integer type"); 1659 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1660 Step = Builder.CreateTrunc(Step, TruncType); 1661 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1662 } 1663 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1664 Value *SteppedStart = 1665 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1666 1667 // We create vector phi nodes for both integer and floating-point induction 1668 // variables. Here, we determine the kind of arithmetic we will perform. 1669 Instruction::BinaryOps AddOp; 1670 Instruction::BinaryOps MulOp; 1671 if (Step->getType()->isIntegerTy()) { 1672 AddOp = Instruction::Add; 1673 MulOp = Instruction::Mul; 1674 } else { 1675 AddOp = II.getInductionOpcode(); 1676 MulOp = Instruction::FMul; 1677 } 1678 1679 // Multiply the vectorization factor by the step using integer or 1680 // floating-point arithmetic as appropriate. 1681 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1682 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1683 1684 // Create a vector splat to use in the induction update. 1685 // 1686 // FIXME: If the step is non-constant, we create the vector splat with 1687 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1688 // handle a constant vector splat. 1689 Value *SplatVF = isa<Constant>(Mul) 1690 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1691 : Builder.CreateVectorSplat(VF, Mul); 1692 Builder.restoreIP(CurrIP); 1693 1694 // We may need to add the step a number of times, depending on the unroll 1695 // factor. The last of those goes into the PHI. 
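  // Rough shape of the result for an i32 IV with step 1, VF = 4, UF = 2
  // (illustrative names only):
  //
  //   vector.body:
  //     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
  //                              [ %vec.ind.next, %vector.body ]        ; part 0
  //     %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4> ; part 1
  //     ...
  //     %vec.ind.next = add <4 x i32> %step.add, <i32 4, ...>   ; moved to latch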
1696 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1697 &*LoopVectorBody->getFirstInsertionPt()); 1698 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1699 Instruction *LastInduction = VecInd; 1700 for (unsigned Part = 0; Part < UF; ++Part) { 1701 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1702 1703 if (isa<TruncInst>(EntryVal)) 1704 addMetadata(LastInduction, EntryVal); 1705 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1706 1707 LastInduction = cast<Instruction>(addFastMathFlag( 1708 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1709 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1710 } 1711 1712 // Move the last step to the end of the latch block. This ensures consistent 1713 // placement of all induction updates. 1714 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1715 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1716 auto *ICmp = cast<Instruction>(Br->getCondition()); 1717 LastInduction->moveBefore(ICmp); 1718 LastInduction->setName("vec.ind.next"); 1719 1720 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1721 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1722 } 1723 1724 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1725 return Cost->isScalarAfterVectorization(I, VF) || 1726 Cost->isProfitableToScalarize(I, VF); 1727 } 1728 1729 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1730 if (shouldScalarizeInstruction(IV)) 1731 return true; 1732 auto isScalarInst = [&](User *U) -> bool { 1733 auto *I = cast<Instruction>(U); 1734 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1735 }; 1736 return llvm::any_of(IV->users(), isScalarInst); 1737 } 1738 1739 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1740 const InductionDescriptor &ID, const Instruction *EntryVal, 1741 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1742 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1743 "Expected either an induction phi-node or a truncate of it!"); 1744 1745 // This induction variable is not the phi from the original loop but the 1746 // newly-created IV based on the proof that casted Phi is equal to the 1747 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1748 // re-uses the same InductionDescriptor that original IV uses but we don't 1749 // have to do any recording in this case - that is done when original IV is 1750 // processed. 1751 if (isa<TruncInst>(EntryVal)) 1752 return; 1753 1754 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1755 if (Casts.empty()) 1756 return; 1757 // Only the first Cast instruction in the Casts vector is of interest. 1758 // The rest of the Casts (if exist) have no uses outside the 1759 // induction update chain itself. 
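  // Illustrative shape of such a chain (made-up names): a narrow IV whose
  // update goes through a cast that SCEV has shown, possibly under a runtime
  // guard, to compute the same sequence as the IV itself:
  //
  //   %iv      = phi i32 [ 0, %ph ], [ %iv.next, %latch ]
  //   %iv.ext  = sext i32 %iv to i64        ; the recorded cast
  //   %add     = add i64 %iv.ext, 1
  //   %iv.next = trunc i64 %add to i32
  //
  // Only %iv.ext may have uses elsewhere in the loop, so the value of the
  // widened IV is recorded for it below.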
1760 Instruction *CastInst = *Casts.begin(); 1761 if (Lane < UINT_MAX) 1762 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1763 else 1764 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1765 } 1766 1767 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1768 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1769 "Primary induction variable must have an integer type"); 1770 1771 auto II = Legal->getInductionVars()->find(IV); 1772 assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); 1773 1774 auto ID = II->second; 1775 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1776 1777 // The scalar value to broadcast. This will be derived from the canonical 1778 // induction variable. 1779 Value *ScalarIV = nullptr; 1780 1781 // The value from the original loop to which we are mapping the new induction 1782 // variable. 1783 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1784 1785 // True if we have vectorized the induction variable. 1786 auto VectorizedIV = false; 1787 1788 // Determine if we want a scalar version of the induction variable. This is 1789 // true if the induction variable itself is not widened, or if it has at 1790 // least one user in the loop that is not widened. 1791 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); 1792 1793 // Generate code for the induction step. Note that induction steps are 1794 // required to be loop-invariant 1795 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && 1796 "Induction step should be loop invariant"); 1797 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1798 Value *Step = nullptr; 1799 if (PSE.getSE()->isSCEVable(IV->getType())) { 1800 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1801 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), 1802 LoopVectorPreHeader->getTerminator()); 1803 } else { 1804 Step = cast<SCEVUnknown>(ID.getStep())->getValue(); 1805 } 1806 1807 // Try to create a new independent vector induction variable. If we can't 1808 // create the phi node, we will splat the scalar induction variable in each 1809 // loop iteration. 1810 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { 1811 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1812 VectorizedIV = true; 1813 } 1814 1815 // If we haven't yet vectorized the induction variable, or if we will create 1816 // a scalar one, we need to define the scalar induction variable and step 1817 // values. If we were given a truncation type, truncate the canonical 1818 // induction variable and step. Otherwise, derive these values from the 1819 // induction descriptor. 1820 if (!VectorizedIV || NeedsScalarIV) { 1821 ScalarIV = Induction; 1822 if (IV != OldInduction) { 1823 ScalarIV = IV->getType()->isIntegerTy() 1824 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1825 : Builder.CreateCast(Instruction::SIToFP, Induction, 1826 IV->getType()); 1827 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1828 ScalarIV->setName("offset.idx"); 1829 } 1830 if (Trunc) { 1831 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1832 assert(Step->getType()->isIntegerTy() && 1833 "Truncation requires an integer step"); 1834 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1835 Step = Builder.CreateTrunc(Step, TruncType); 1836 } 1837 } 1838 1839 // If we haven't yet vectorized the induction variable, splat the scalar 1840 // induction variable, and build the necessary step vectors. 1841 // TODO: Don't do it unless the vectorized IV is really required. 1842 if (!VectorizedIV) { 1843 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1844 for (unsigned Part = 0; Part < UF; ++Part) { 1845 Value *EntryPart = 1846 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1847 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1848 if (Trunc) 1849 addMetadata(EntryPart, Trunc); 1850 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1851 } 1852 } 1853 1854 // If an induction variable is only used for counting loop iterations or 1855 // calculating addresses, it doesn't need to be widened. Create scalar steps 1856 // that can be used by instructions we will later scalarize. Note that the 1857 // addition of the scalar steps will not increase the number of instructions 1858 // in the loop in the common case prior to InstCombine. We will be trading 1859 // one vector extract for each scalar step. 1860 if (NeedsScalarIV) 1861 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1862 } 1863 1864 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1865 Instruction::BinaryOps BinOp) { 1866 // Create and check the types. 1867 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1868 int VLen = Val->getType()->getVectorNumElements(); 1869 1870 Type *STy = Val->getType()->getScalarType(); 1871 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1872 "Induction Step must be an integer or FP"); 1873 assert(Step->getType() == STy && "Step has wrong type"); 1874 1875 SmallVector<Constant *, 8> Indices; 1876 1877 if (STy->isIntegerTy()) { 1878 // Create a vector of consecutive numbers from zero to VF. 1879 for (int i = 0; i < VLen; ++i) 1880 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1881 1882 // Add the consecutive indices to the vector value. 1883 Constant *Cv = ConstantVector::get(Indices); 1884 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1885 Step = Builder.CreateVectorSplat(VLen, Step); 1886 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1887 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1888 // which can be found from the original scalar operations. 1889 Step = Builder.CreateMul(Cv, Step); 1890 return Builder.CreateAdd(Val, Step, "induction"); 1891 } 1892 1893 // Floating point induction. 1894 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1895 "Binary Opcode should be specified for FP induction"); 1896 // Create a vector of consecutive numbers from zero to VF. 1897 for (int i = 0; i < VLen; ++i) 1898 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1899 1900 // Add the consecutive indices to the vector value. 
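  // Sketch of this FP path for VF = 4, StartIdx = 0 and an FAdd induction
  // (illustrative names only):
  //
  //   %cv        = <float 0.0, float 1.0, float 2.0, float 3.0>
  //   %step.vec  = splat of %Step
  //   %mul       = fmul fast <4 x float> %cv, %step.vec
  //   %induction = fadd fast <4 x float> %Val, %mul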
1901 Constant *Cv = ConstantVector::get(Indices); 1902 1903 Step = Builder.CreateVectorSplat(VLen, Step); 1904 1905 // Floating point operations had to be 'fast' to enable the induction. 1906 FastMathFlags Flags; 1907 Flags.setFast(); 1908 1909 Value *MulOp = Builder.CreateFMul(Cv, Step); 1910 if (isa<Instruction>(MulOp)) 1911 // Have to check, MulOp may be a constant 1912 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1913 1914 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1915 if (isa<Instruction>(BOp)) 1916 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1917 return BOp; 1918 } 1919 1920 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1921 Instruction *EntryVal, 1922 const InductionDescriptor &ID) { 1923 // We shouldn't have to build scalar steps if we aren't vectorizing. 1924 assert(VF > 1 && "VF should be greater than one"); 1925 1926 // Get the value type and ensure it and the step have the same integer type. 1927 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1928 assert(ScalarIVTy == Step->getType() && 1929 "Val and Step should have the same type"); 1930 1931 // We build scalar steps for both integer and floating-point induction 1932 // variables. Here, we determine the kind of arithmetic we will perform. 1933 Instruction::BinaryOps AddOp; 1934 Instruction::BinaryOps MulOp; 1935 if (ScalarIVTy->isIntegerTy()) { 1936 AddOp = Instruction::Add; 1937 MulOp = Instruction::Mul; 1938 } else { 1939 AddOp = ID.getInductionOpcode(); 1940 MulOp = Instruction::FMul; 1941 } 1942 1943 // Determine the number of scalars we need to generate for each unroll 1944 // iteration. If EntryVal is uniform, we only need to generate the first 1945 // lane. Otherwise, we generate all VF values. 1946 unsigned Lanes = 1947 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1948 : VF; 1949 // Compute the scalar steps and save the results in VectorLoopValueMap. 1950 for (unsigned Part = 0; Part < UF; ++Part) { 1951 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1952 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1953 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1954 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1955 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1956 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1957 } 1958 } 1959 } 1960 1961 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 1962 assert(V != Induction && "The new induction variable should not be used."); 1963 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 1964 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 1965 1966 // If we have a stride that is replaced by one, do it here. Defer this for 1967 // the VPlan-native path until we start running Legal checks in that path. 1968 if (!EnableVPlanNativePath && Legal->hasStride(V)) 1969 V = ConstantInt::get(V->getType(), 1); 1970 1971 // If we have a vector mapped to this value, return it. 1972 if (VectorLoopValueMap.hasVectorValue(V, Part)) 1973 return VectorLoopValueMap.getVectorValue(V, Part); 1974 1975 // If the value has not been vectorized, check if it has been scalarized 1976 // instead. If it has been scalarized, and we actually need the value in 1977 // vector form, we will construct the vector values on demand. 
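  // For a non-uniform scalarized value and VF = 4, the on-demand packing below
  // conceptually produces (illustrative names):
  //
  //   %p0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  //   %p1 = insertelement <4 x i32> %p0,   i32 %s1, i32 1
  //   %p2 = insertelement <4 x i32> %p1,   i32 %s2, i32 2
  //   %p3 = insertelement <4 x i32> %p2,   i32 %s3, i32 3
  //
  // inserted right after the last scalar definition so that the packed vector
  // dominates its users.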
1978 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 1979 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 1980 1981 // If we've scalarized a value, that value should be an instruction. 1982 auto *I = cast<Instruction>(V); 1983 1984 // If we aren't vectorizing, we can just copy the scalar map values over to 1985 // the vector map. 1986 if (VF == 1) { 1987 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 1988 return ScalarValue; 1989 } 1990 1991 // Get the last scalar instruction we generated for V and Part. If the value 1992 // is known to be uniform after vectorization, this corresponds to lane zero 1993 // of the Part unroll iteration. Otherwise, the last instruction is the one 1994 // we created for the last vector lane of the Part unroll iteration. 1995 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 1996 auto *LastInst = cast<Instruction>( 1997 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 1998 1999 // Set the insert point after the last scalarized instruction. This ensures 2000 // the insertelement sequence will directly follow the scalar definitions. 2001 auto OldIP = Builder.saveIP(); 2002 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2003 Builder.SetInsertPoint(&*NewIP); 2004 2005 // However, if we are vectorizing, we need to construct the vector values. 2006 // If the value is known to be uniform after vectorization, we can just 2007 // broadcast the scalar value corresponding to lane zero for each unroll 2008 // iteration. Otherwise, we construct the vector values using insertelement 2009 // instructions. Since the resulting vectors are stored in 2010 // VectorLoopValueMap, we will only generate the insertelements once. 2011 Value *VectorValue = nullptr; 2012 if (Cost->isUniformAfterVectorization(I, VF)) { 2013 VectorValue = getBroadcastInstrs(ScalarValue); 2014 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2015 } else { 2016 // Initialize packing with insertelements to start from undef. 2017 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2018 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2019 for (unsigned Lane = 0; Lane < VF; ++Lane) 2020 packScalarIntoVectorValue(V, {Part, Lane}); 2021 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2022 } 2023 Builder.restoreIP(OldIP); 2024 return VectorValue; 2025 } 2026 2027 // If this scalar is unknown, assume that it is a constant or that it is 2028 // loop invariant. Broadcast V and save the value for future uses. 2029 Value *B = getBroadcastInstrs(V); 2030 VectorLoopValueMap.setVectorValue(V, Part, B); 2031 return B; 2032 } 2033 2034 Value * 2035 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2036 const VPIteration &Instance) { 2037 // If the value is not an instruction contained in the loop, it should 2038 // already be scalar. 2039 if (OrigLoop->isLoopInvariant(V)) 2040 return V; 2041 2042 assert(Instance.Lane > 0 2043 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2044 : true && "Uniform values only have lane zero"); 2045 2046 // If the value from the original loop has not been vectorized, it is 2047 // represented by UF x VF scalar values in the new loop. Return the requested 2048 // scalar value. 2049 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2050 return VectorLoopValueMap.getScalarValue(V, Instance); 2051 2052 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2053 // for the given unroll part. 
If this entry is not a vector type (i.e., the 2054 // vectorization factor is one), there is no need to generate an 2055 // extractelement instruction. 2056 auto *U = getOrCreateVectorValue(V, Instance.Part); 2057 if (!U->getType()->isVectorTy()) { 2058 assert(VF == 1 && "Value not scalarized has non-vector type"); 2059 return U; 2060 } 2061 2062 // Otherwise, the value from the original loop has been vectorized and is 2063 // represented by UF vector values. Extract and return the requested scalar 2064 // value from the appropriate vector lane. 2065 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2066 } 2067 2068 void InnerLoopVectorizer::packScalarIntoVectorValue( 2069 Value *V, const VPIteration &Instance) { 2070 assert(V != Induction && "The new induction variable should not be used."); 2071 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2072 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2073 2074 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2075 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2076 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2077 Builder.getInt32(Instance.Lane)); 2078 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2079 } 2080 2081 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2082 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2083 SmallVector<Constant *, 8> ShuffleMask; 2084 for (unsigned i = 0; i < VF; ++i) 2085 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2086 2087 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2088 ConstantVector::get(ShuffleMask), 2089 "reverse"); 2090 } 2091 2092 // Return whether we allow using masked interleave-groups (for dealing with 2093 // strided loads/stores that reside in predicated blocks, or for dealing 2094 // with gaps). 2095 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2096 // If an override option has been passed in for interleaved accesses, use it. 2097 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2098 return EnableMaskedInterleavedMemAccesses; 2099 2100 return TTI.enableMaskedInterleavedAccessVectorization(); 2101 } 2102 2103 // Try to vectorize the interleave group that \p Instr belongs to. 2104 // 2105 // E.g. Translate following interleaved load group (factor = 3): 2106 // for (i = 0; i < N; i+=3) { 2107 // R = Pic[i]; // Member of index 0 2108 // G = Pic[i+1]; // Member of index 1 2109 // B = Pic[i+2]; // Member of index 2 2110 // ... // do something to R, G, B 2111 // } 2112 // To: 2113 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2114 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2115 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2116 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2117 // 2118 // Or translate following interleaved store group (factor = 3): 2119 // for (i = 0; i < N; i+=3) { 2120 // ... 
do something to R, G, B 2121 // Pic[i] = R; // Member of index 0 2122 // Pic[i+1] = G; // Member of index 1 2123 // Pic[i+2] = B; // Member of index 2 2124 // } 2125 // To: 2126 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2127 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2128 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2129 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2130 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2131 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2132 VectorParts *BlockInMask) { 2133 const InterleaveGroup<Instruction> *Group = 2134 Cost->getInterleavedAccessGroup(Instr); 2135 assert(Group && "Fail to get an interleaved access group."); 2136 2137 // Skip if current instruction is not the insert position. 2138 if (Instr != Group->getInsertPos()) 2139 return; 2140 2141 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2142 Value *Ptr = getLoadStorePointerOperand(Instr); 2143 2144 // Prepare for the vector type of the interleaved load/store. 2145 Type *ScalarTy = getMemInstValueType(Instr); 2146 unsigned InterleaveFactor = Group->getFactor(); 2147 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2148 Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); 2149 2150 // Prepare for the new pointers. 2151 setDebugLocFromInst(Builder, Ptr); 2152 SmallVector<Value *, 2> NewPtrs; 2153 unsigned Index = Group->getIndex(Instr); 2154 2155 VectorParts Mask; 2156 bool IsMaskForCondRequired = BlockInMask; 2157 if (IsMaskForCondRequired) { 2158 Mask = *BlockInMask; 2159 // TODO: extend the masked interleaved-group support to reversed access. 2160 assert(!Group->isReverse() && "Reversed masked interleave-group " 2161 "not supported."); 2162 } 2163 2164 // If the group is reverse, adjust the index to refer to the last vector lane 2165 // instead of the first. We adjust the index from the first vector lane, 2166 // rather than directly getting the pointer for lane VF - 1, because the 2167 // pointer operand of the interleaved access is supposed to be uniform. For 2168 // uniform instructions, we're only required to generate a value for the 2169 // first vector lane in each unroll iteration. 2170 if (Group->isReverse()) 2171 Index += (VF - 1) * Group->getFactor(); 2172 2173 bool InBounds = false; 2174 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2175 InBounds = gep->isInBounds(); 2176 2177 for (unsigned Part = 0; Part < UF; Part++) { 2178 Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); 2179 2180 // Notice current instruction could be any index. Need to adjust the address 2181 // to the member of index 0. 2182 // 2183 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2184 // b = A[i]; // Member of index 0 2185 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2186 // 2187 // E.g. A[i+1] = a; // Member of index 1 2188 // A[i] = b; // Member of index 0 2189 // A[i+2] = c; // Member of index 2 (Current instruction) 2190 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2191 NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); 2192 if (InBounds) 2193 cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); 2194 2195 // Cast to the vector pointer type. 
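    // E.g. for a factor-3 group whose insert position is the member of index
    // 2, with VF = 4 (illustrative names):
    //
    //   %base     = getelementptr i32, i32* %member2.ptr, i32 -2 ; member 0
    //   %wide.ptr = bitcast i32* %base to <12 x i32>*            ; factor * VF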
2196 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); 2197 } 2198 2199 setDebugLocFromInst(Builder, Instr); 2200 Value *UndefVec = UndefValue::get(VecTy); 2201 2202 Value *MaskForGaps = nullptr; 2203 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2204 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2205 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2206 } 2207 2208 // Vectorize the interleaved load group. 2209 if (isa<LoadInst>(Instr)) { 2210 // For each unroll part, create a wide load for the group. 2211 SmallVector<Value *, 2> NewLoads; 2212 for (unsigned Part = 0; Part < UF; Part++) { 2213 Instruction *NewLoad; 2214 if (IsMaskForCondRequired || MaskForGaps) { 2215 assert(useMaskedInterleavedAccesses(*TTI) && 2216 "masked interleaved groups are not allowed."); 2217 Value *GroupMask = MaskForGaps; 2218 if (IsMaskForCondRequired) { 2219 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2220 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2221 Value *ShuffledMask = Builder.CreateShuffleVector( 2222 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2223 GroupMask = MaskForGaps 2224 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2225 MaskForGaps) 2226 : ShuffledMask; 2227 } 2228 NewLoad = 2229 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), 2230 GroupMask, UndefVec, "wide.masked.vec"); 2231 } 2232 else 2233 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], 2234 Group->getAlignment(), "wide.vec"); 2235 Group->addMetadata(NewLoad); 2236 NewLoads.push_back(NewLoad); 2237 } 2238 2239 // For each member in the group, shuffle out the appropriate data from the 2240 // wide loads. 2241 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2242 Instruction *Member = Group->getMember(I); 2243 2244 // Skip the gaps in the group. 2245 if (!Member) 2246 continue; 2247 2248 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2249 for (unsigned Part = 0; Part < UF; Part++) { 2250 Value *StridedVec = Builder.CreateShuffleVector( 2251 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2252 2253 // If this member has different type, cast the result type. 2254 if (Member->getType() != ScalarTy) { 2255 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2256 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2257 } 2258 2259 if (Group->isReverse()) 2260 StridedVec = reverseVector(StridedVec); 2261 2262 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2263 } 2264 } 2265 return; 2266 } 2267 2268 // The sub vector type for current instruction. 2269 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2270 2271 // Vectorize the interleaved store group. 2272 for (unsigned Part = 0; Part < UF; Part++) { 2273 // Collect the stored vector from each member. 2274 SmallVector<Value *, 4> StoredVecs; 2275 for (unsigned i = 0; i < InterleaveFactor; i++) { 2276 // Interleaved store group doesn't allow a gap, so each index has a member 2277 Instruction *Member = Group->getMember(i); 2278 assert(Member && "Fail to get a member from an interleaved store group"); 2279 2280 Value *StoredVec = getOrCreateVectorValue( 2281 cast<StoreInst>(Member)->getValueOperand(), Part); 2282 if (Group->isReverse()) 2283 StoredVec = reverseVector(StoredVec); 2284 2285 // If this member has different type, cast it to a unified type. 
2286 2287 if (StoredVec->getType() != SubVT) 2288 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2289 2290 StoredVecs.push_back(StoredVec); 2291 } 2292 2293 // Concatenate all vectors into a wide vector. 2294 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2295 2296 // Interleave the elements in the wide vector. 2297 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2298 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2299 "interleaved.vec"); 2300 2301 Instruction *NewStoreInstr; 2302 if (IsMaskForCondRequired) { 2303 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2304 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2305 Value *ShuffledMask = Builder.CreateShuffleVector( 2306 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2307 NewStoreInstr = Builder.CreateMaskedStore( 2308 IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); 2309 } 2310 else 2311 NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 2312 Group->getAlignment()); 2313 2314 Group->addMetadata(NewStoreInstr); 2315 } 2316 } 2317 2318 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2319 VectorParts *BlockInMask) { 2320 // Attempt to issue a wide load. 2321 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2322 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2323 2324 assert((LI || SI) && "Invalid Load/Store instruction"); 2325 2326 LoopVectorizationCostModel::InstWidening Decision = 2327 Cost->getWideningDecision(Instr, VF); 2328 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2329 "CM decision should be taken at this point"); 2330 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2331 return vectorizeInterleaveGroup(Instr); 2332 2333 Type *ScalarDataTy = getMemInstValueType(Instr); 2334 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2335 Value *Ptr = getLoadStorePointerOperand(Instr); 2336 unsigned Alignment = getLoadStoreAlignment(Instr); 2337 // An alignment of 0 means target abi alignment. We need to use the scalar's 2338 // target abi alignment in such a case. 2339 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2340 if (!Alignment) 2341 Alignment = DL.getABITypeAlignment(ScalarDataTy); 2342 unsigned AddressSpace = getLoadStoreAddressSpace(Instr); 2343 2344 // Determine if the pointer operand of the access is either consecutive or 2345 // reverse consecutive. 2346 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2347 bool ConsecutiveStride = 2348 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2349 bool CreateGatherScatter = 2350 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2351 2352 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2353 // gather/scatter. Otherwise Decision should have been to Scalarize. 2354 assert((ConsecutiveStride || CreateGatherScatter) && 2355 "The instruction should be scalarized"); 2356 2357 // Handle consecutive loads/stores. 2358 if (ConsecutiveStride) 2359 Ptr = getOrCreateScalarValue(Ptr, {0, 0}); 2360 2361 VectorParts Mask; 2362 bool isMaskRequired = BlockInMask; 2363 if (isMaskRequired) 2364 Mask = *BlockInMask; 2365 2366 bool InBounds = false; 2367 if (auto *gep = dyn_cast<GetElementPtrInst>( 2368 getLoadStorePointerOperand(Instr)->stripPointerCasts())) 2369 InBounds = gep->isInBounds(); 2370 2371 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2372 // Calculate the pointer for the specific unroll-part. 
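    // For a forward, consecutive access with VF = 4 this boils down to
    // (illustrative; T stands for the scalar element type):
    //
    //   %part.ptr = getelementptr T, T* %Ptr, i32 (Part * 4)
    //   %vec.ptr  = bitcast T* %part.ptr to <4 x T>*
    //
    // For the reverse case the GEP instead steps back by Part * VF and then by
    // another VF - 1 elements, so the wide access covers the preceding lanes.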
2373 GetElementPtrInst *PartPtr = nullptr; 2374 2375 if (Reverse) { 2376 // If the address is consecutive but reversed, then the 2377 // wide store needs to start at the last vector element. 2378 PartPtr = cast<GetElementPtrInst>( 2379 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2380 PartPtr->setIsInBounds(InBounds); 2381 PartPtr = cast<GetElementPtrInst>( 2382 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2383 PartPtr->setIsInBounds(InBounds); 2384 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2385 Mask[Part] = reverseVector(Mask[Part]); 2386 } else { 2387 PartPtr = cast<GetElementPtrInst>( 2388 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2389 PartPtr->setIsInBounds(InBounds); 2390 } 2391 2392 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2393 }; 2394 2395 // Handle Stores: 2396 if (SI) { 2397 setDebugLocFromInst(Builder, SI); 2398 2399 for (unsigned Part = 0; Part < UF; ++Part) { 2400 Instruction *NewSI = nullptr; 2401 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); 2402 if (CreateGatherScatter) { 2403 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2404 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2405 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2406 MaskPart); 2407 } else { 2408 if (Reverse) { 2409 // If we store to reverse consecutive memory locations, then we need 2410 // to reverse the order of elements in the stored value. 2411 StoredVal = reverseVector(StoredVal); 2412 // We don't want to update the value in the map as it might be used in 2413 // another expression. So don't call resetVectorValue(StoredVal). 2414 } 2415 auto *VecPtr = CreateVecPtr(Part, Ptr); 2416 if (isMaskRequired) 2417 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2418 Mask[Part]); 2419 else 2420 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2421 } 2422 addMetadata(NewSI, SI); 2423 } 2424 return; 2425 } 2426 2427 // Handle loads. 2428 assert(LI && "Must have a load instruction"); 2429 setDebugLocFromInst(Builder, LI); 2430 for (unsigned Part = 0; Part < UF; ++Part) { 2431 Value *NewLI; 2432 if (CreateGatherScatter) { 2433 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2434 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2435 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2436 nullptr, "wide.masked.gather"); 2437 addMetadata(NewLI, LI); 2438 } else { 2439 auto *VecPtr = CreateVecPtr(Part, Ptr); 2440 if (isMaskRequired) 2441 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], 2442 UndefValue::get(DataTy), 2443 "wide.masked.load"); 2444 else 2445 NewLI = 2446 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2447 2448 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2449 addMetadata(NewLI, LI); 2450 if (Reverse) 2451 NewLI = reverseVector(NewLI); 2452 } 2453 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2454 } 2455 } 2456 2457 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2458 const VPIteration &Instance, 2459 bool IfPredicateInstr) { 2460 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2461 2462 setDebugLocFromInst(Builder, Instr); 2463 2464 // Does this instruction return a value ? 
2465 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2466 2467 Instruction *Cloned = Instr->clone(); 2468 if (!IsVoidRetTy) 2469 Cloned->setName(Instr->getName() + ".cloned"); 2470 2471 // Replace the operands of the cloned instructions with their scalar 2472 // equivalents in the new loop. 2473 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2474 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2475 Cloned->setOperand(op, NewOp); 2476 } 2477 addNewMetadata(Cloned, Instr); 2478 2479 // Place the cloned scalar in the new loop. 2480 Builder.Insert(Cloned); 2481 2482 // Add the cloned scalar to the scalar map entry. 2483 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2484 2485 // If we just cloned a new assumption, add it the assumption cache. 2486 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2487 if (II->getIntrinsicID() == Intrinsic::assume) 2488 AC->registerAssumption(II); 2489 2490 // End if-block. 2491 if (IfPredicateInstr) 2492 PredicatedInstructions.push_back(Cloned); 2493 } 2494 2495 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2496 Value *End, Value *Step, 2497 Instruction *DL) { 2498 BasicBlock *Header = L->getHeader(); 2499 BasicBlock *Latch = L->getLoopLatch(); 2500 // As we're just creating this loop, it's possible no latch exists 2501 // yet. If so, use the header as this will be a single block loop. 2502 if (!Latch) 2503 Latch = Header; 2504 2505 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2506 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2507 setDebugLocFromInst(Builder, OldInst); 2508 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2509 2510 Builder.SetInsertPoint(Latch->getTerminator()); 2511 setDebugLocFromInst(Builder, OldInst); 2512 2513 // Create i+1 and fill the PHINode. 2514 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2515 Induction->addIncoming(Start, L->getLoopPreheader()); 2516 Induction->addIncoming(Next, Latch); 2517 // Create the compare. 2518 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2519 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2520 2521 // Now we have two terminators. Remove the old one from the block. 2522 Latch->getTerminator()->eraseFromParent(); 2523 2524 return Induction; 2525 } 2526 2527 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2528 if (TripCount) 2529 return TripCount; 2530 2531 assert(L && "Create Trip Count for null loop."); 2532 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2533 // Find the loop boundaries. 2534 ScalarEvolution *SE = PSE.getSE(); 2535 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2536 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2537 "Invalid loop count"); 2538 2539 Type *IdxTy = Legal->getWidestInductionType(); 2540 assert(IdxTy && "No type for induction"); 2541 2542 // The exit count might have the type of i64 while the phi is i32. This can 2543 // happen if we have an induction variable that is sign extended before the 2544 // compare. The only way that we get a backedge taken count is that the 2545 // induction variable was signed and as such will not overflow. In such a case 2546 // truncation is legal. 
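  // E.g. if SCEV gives a backedge-taken count of (%n - 1) as i64 and the
  // widest induction type is i32, the expansion below is roughly:
  //
  //   %btc        = add i64 %n, -1
  //   %btc.trunc  = trunc i64 %btc to i32
  //   %trip.count = add i32 %btc.trunc, 1
  //
  // (illustrative names; the actual instructions come from SCEVExpander).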
2547 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2548 IdxTy->getPrimitiveSizeInBits()) 2549 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2550 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2551 2552 // Get the total trip count from the count by adding 1. 2553 const SCEV *ExitCount = SE->getAddExpr( 2554 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2555 2556 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2557 2558 // Expand the trip count and place the new instructions in the preheader. 2559 // Notice that the pre-header does not change, only the loop body. 2560 SCEVExpander Exp(*SE, DL, "induction"); 2561 2562 // Count holds the overall loop count (N). 2563 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2564 L->getLoopPreheader()->getTerminator()); 2565 2566 if (TripCount->getType()->isPointerTy()) 2567 TripCount = 2568 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2569 L->getLoopPreheader()->getTerminator()); 2570 2571 return TripCount; 2572 } 2573 2574 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2575 if (VectorTripCount) 2576 return VectorTripCount; 2577 2578 Value *TC = getOrCreateTripCount(L); 2579 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2580 2581 Type *Ty = TC->getType(); 2582 Constant *Step = ConstantInt::get(Ty, VF * UF); 2583 2584 // If the tail is to be folded by masking, round the number of iterations N 2585 // up to a multiple of Step instead of rounding down. This is done by first 2586 // adding Step-1 and then rounding down. Note that it's ok if this addition 2587 // overflows: the vector induction variable will eventually wrap to zero given 2588 // that it starts at zero and its Step is a power of two; the loop will then 2589 // exit, with the last early-exit vector comparison also producing all-true. 2590 if (Cost->foldTailByMasking()) { 2591 assert(isPowerOf2_32(VF * UF) && 2592 "VF*UF must be a power of 2 when folding tail by masking"); 2593 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2594 } 2595 2596 // Now we need to generate the expression for the part of the loop that the 2597 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2598 // iterations are not required for correctness, or N - Step, otherwise. Step 2599 // is equal to the vectorization factor (number of SIMD elements) times the 2600 // unroll factor (number of SIMD instructions). 2601 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2602 2603 // If there is a non-reversed interleaved group that may speculatively access 2604 // memory out-of-bounds, we need to ensure that there will be at least one 2605 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2606 // the trip count, we set the remainder to be equal to the step. If the step 2607 // does not evenly divide the trip count, no adjustment is necessary since 2608 // there will already be scalar iterations. Note that the minimum iterations 2609 // check ensures that N >= Step. 
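  // Putting it together for VF = 4, UF = 2 (illustrative names):
  //
  //   %n.mod.vf = urem i32 %n, 8
  //   %cmp.zero = icmp eq i32 %n.mod.vf, 0                   ; only when a
  //   %r        = select i1 %cmp.zero, i32 8, i32 %n.mod.vf  ; scalar epilogue
  //   %n.vec    = sub i32 %n, %r                             ; is required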
2610 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2611 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2612 R = Builder.CreateSelect(IsZero, Step, R); 2613 } 2614 2615 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2616 2617 return VectorTripCount; 2618 } 2619 2620 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2621 const DataLayout &DL) { 2622 // Verify that V is a vector type with same number of elements as DstVTy. 2623 unsigned VF = DstVTy->getNumElements(); 2624 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2625 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2626 Type *SrcElemTy = SrcVecTy->getElementType(); 2627 Type *DstElemTy = DstVTy->getElementType(); 2628 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2629 "Vector elements must have same size"); 2630 2631 // Do a direct cast if element types are castable. 2632 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2633 return Builder.CreateBitOrPointerCast(V, DstVTy); 2634 } 2635 // V cannot be directly casted to desired vector type. 2636 // May happen when V is a floating point vector but DstVTy is a vector of 2637 // pointers or vice-versa. Handle this using a two-step bitcast using an 2638 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2639 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2640 "Only one type should be a pointer type"); 2641 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2642 "Only one type should be a floating point type"); 2643 Type *IntTy = 2644 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2645 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2646 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2647 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2648 } 2649 2650 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2651 BasicBlock *Bypass) { 2652 Value *Count = getOrCreateTripCount(L); 2653 BasicBlock *BB = L->getLoopPreheader(); 2654 IRBuilder<> Builder(BB->getTerminator()); 2655 2656 // Generate code to check if the loop's trip count is less than VF * UF, or 2657 // equal to it in case a scalar epilogue is required; this implies that the 2658 // vector trip count is zero. This check also covers the case where adding one 2659 // to the backedge-taken count overflowed leading to an incorrect trip count 2660 // of zero. In this case we will also jump to the scalar loop. 2661 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2662 : ICmpInst::ICMP_ULT; 2663 2664 // If tail is to be folded, vector loop takes care of all iterations. 2665 Value *CheckMinIters = Builder.getFalse(); 2666 if (!Cost->foldTailByMasking()) 2667 CheckMinIters = Builder.CreateICmp( 2668 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2669 "min.iters.check"); 2670 2671 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2672 // Update dominator tree immediately if the generated block is a 2673 // LoopBypassBlock because SCEV expansions to generate loop bypass 2674 // checks may query it before the current function is finished. 
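  // After the split below, the old preheader roughly becomes (VF = 4, UF = 2,
  // no tail folding; illustrative names):
  //
  //   loop.preheader:                                  ; bypass block
  //     %min.iters.check = icmp ult i32 %trip.count, 8
  //     br i1 %min.iters.check, label %scalar.ph, label %vector.ph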
2675 DT->addNewBlock(NewBB, BB); 2676 if (L->getParentLoop()) 2677 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2678 ReplaceInstWithInst(BB->getTerminator(), 2679 BranchInst::Create(Bypass, NewBB, CheckMinIters)); 2680 LoopBypassBlocks.push_back(BB); 2681 } 2682 2683 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2684 BasicBlock *BB = L->getLoopPreheader(); 2685 2686 // Generate the code to check that the SCEV assumptions that we made. 2687 // We want the new basic block to start at the first instruction in a 2688 // sequence of instructions that form a check. 2689 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2690 "scev.check"); 2691 Value *SCEVCheck = 2692 Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); 2693 2694 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2695 if (C->isZero()) 2696 return; 2697 2698 assert(!Cost->foldTailByMasking() && 2699 "Cannot SCEV check stride or overflow when folding tail"); 2700 // Create a new block containing the stride check. 2701 BB->setName("vector.scevcheck"); 2702 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2703 // Update dominator tree immediately if the generated block is a 2704 // LoopBypassBlock because SCEV expansions to generate loop bypass 2705 // checks may query it before the current function is finished. 2706 DT->addNewBlock(NewBB, BB); 2707 if (L->getParentLoop()) 2708 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2709 ReplaceInstWithInst(BB->getTerminator(), 2710 BranchInst::Create(Bypass, NewBB, SCEVCheck)); 2711 LoopBypassBlocks.push_back(BB); 2712 AddedSafetyChecks = true; 2713 } 2714 2715 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2716 // VPlan-native path does not do any analysis for runtime checks currently. 2717 if (EnableVPlanNativePath) 2718 return; 2719 2720 BasicBlock *BB = L->getLoopPreheader(); 2721 2722 // Generate the code that checks in runtime if arrays overlap. We put the 2723 // checks into a separate block to make the more common case of few elements 2724 // faster. 2725 Instruction *FirstCheckInst; 2726 Instruction *MemRuntimeCheck; 2727 std::tie(FirstCheckInst, MemRuntimeCheck) = 2728 Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); 2729 if (!MemRuntimeCheck) 2730 return; 2731 2732 assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail"); 2733 // Create a new block containing the memory check. 2734 BB->setName("vector.memcheck"); 2735 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2736 // Update dominator tree immediately if the generated block is a 2737 // LoopBypassBlock because SCEV expansions to generate loop bypass 2738 // checks may query it before the current function is finished. 2739 DT->addNewBlock(NewBB, BB); 2740 if (L->getParentLoop()) 2741 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2742 ReplaceInstWithInst(BB->getTerminator(), 2743 BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); 2744 LoopBypassBlocks.push_back(BB); 2745 AddedSafetyChecks = true; 2746 2747 // We currently don't use LoopVersioning for the actual loop cloning but we 2748 // still use it to add the noalias metadata. 
2749 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2750 PSE.getSE()); 2751 LVer->prepareNoAliasMetadata(); 2752 } 2753 2754 Value *InnerLoopVectorizer::emitTransformedIndex( 2755 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2756 const InductionDescriptor &ID) const { 2757 2758 SCEVExpander Exp(*SE, DL, "induction"); 2759 auto Step = ID.getStep(); 2760 auto StartValue = ID.getStartValue(); 2761 assert(Index->getType() == Step->getType() && 2762 "Index type does not match StepValue type"); 2763 2764 // Note: the IR at this point is broken. We cannot use SE to create any new 2765 // SCEV and then expand it, hoping that SCEV's simplification will give us 2766 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2767 // lead to various SCEV crashes. So all we can do is to use builder and rely 2768 // on InstCombine for future simplifications. Here we handle some trivial 2769 // cases only. 2770 auto CreateAdd = [&B](Value *X, Value *Y) { 2771 assert(X->getType() == Y->getType() && "Types don't match!"); 2772 if (auto *CX = dyn_cast<ConstantInt>(X)) 2773 if (CX->isZero()) 2774 return Y; 2775 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2776 if (CY->isZero()) 2777 return X; 2778 return B.CreateAdd(X, Y); 2779 }; 2780 2781 auto CreateMul = [&B](Value *X, Value *Y) { 2782 assert(X->getType() == Y->getType() && "Types don't match!"); 2783 if (auto *CX = dyn_cast<ConstantInt>(X)) 2784 if (CX->isOne()) 2785 return Y; 2786 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2787 if (CY->isOne()) 2788 return X; 2789 return B.CreateMul(X, Y); 2790 }; 2791 2792 switch (ID.getKind()) { 2793 case InductionDescriptor::IK_IntInduction: { 2794 assert(Index->getType() == StartValue->getType() && 2795 "Index type does not match StartValue type"); 2796 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2797 return B.CreateSub(StartValue, Index); 2798 auto *Offset = CreateMul( 2799 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2800 return CreateAdd(StartValue, Offset); 2801 } 2802 case InductionDescriptor::IK_PtrInduction: { 2803 assert(isa<SCEVConstant>(Step) && 2804 "Expected constant step for pointer induction"); 2805 return B.CreateGEP( 2806 StartValue->getType()->getPointerElementType(), StartValue, 2807 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2808 &*B.GetInsertPoint()))); 2809 } 2810 case InductionDescriptor::IK_FpInduction: { 2811 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2812 auto InductionBinOp = ID.getInductionBinOp(); 2813 assert(InductionBinOp && 2814 (InductionBinOp->getOpcode() == Instruction::FAdd || 2815 InductionBinOp->getOpcode() == Instruction::FSub) && 2816 "Original bin op should be defined for FP induction"); 2817 2818 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2819 2820 // Floating point operations had to be 'fast' to enable the induction. 2821 FastMathFlags Flags; 2822 Flags.setFast(); 2823 2824 Value *MulExp = B.CreateFMul(StepValue, Index); 2825 if (isa<Instruction>(MulExp)) 2826 // We have to check, the MulExp may be a constant. 
2827 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2828 2829 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2830 "induction"); 2831 if (isa<Instruction>(BOp)) 2832 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2833 2834 return BOp; 2835 } 2836 case InductionDescriptor::IK_NoInduction: 2837 return nullptr; 2838 } 2839 llvm_unreachable("invalid enum"); 2840 } 2841 2842 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2843 /* 2844 In this function we generate a new loop. The new loop will contain 2845 the vectorized instructions while the old loop will continue to run the 2846 scalar remainder. 2847 2848 [ ] <-- loop iteration number check. 2849 / | 2850 / v 2851 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2852 | / | 2853 | / v 2854 || [ ] <-- vector pre header. 2855 |/ | 2856 | v 2857 | [ ] \ 2858 | [ ]_| <-- vector loop. 2859 | | 2860 | v 2861 | -[ ] <--- middle-block. 2862 | / | 2863 | / v 2864 -|- >[ ] <--- new preheader. 2865 | | 2866 | v 2867 | [ ] \ 2868 | [ ]_| <-- old scalar loop to handle remainder. 2869 \ | 2870 \ v 2871 >[ ] <-- exit block. 2872 ... 2873 */ 2874 2875 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 2876 BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); 2877 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 2878 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2879 assert(VectorPH && "Invalid loop structure"); 2880 assert(ExitBlock && "Must have an exit block"); 2881 2882 // Some loops have a single integer induction variable, while other loops 2883 // don't. One example is c++ iterators that often have multiple pointer 2884 // induction variables. In the code below we also support a case where we 2885 // don't have a single induction variable. 2886 // 2887 // We try to obtain an induction variable from the original loop as hard 2888 // as possible. However if we don't find one that: 2889 // - is an integer 2890 // - counts from zero, stepping by one 2891 // - is the size of the widest induction variable type 2892 // then we create a new one. 2893 OldInduction = Legal->getPrimaryInduction(); 2894 Type *IdxTy = Legal->getWidestInductionType(); 2895 2896 // Split the single block loop into the two loop structure described above. 2897 BasicBlock *VecBody = 2898 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 2899 BasicBlock *MiddleBlock = 2900 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 2901 BasicBlock *ScalarPH = 2902 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 2903 2904 // Create and register the new vector loop. 2905 Loop *Lp = LI->AllocateLoop(); 2906 Loop *ParentLoop = OrigLoop->getParentLoop(); 2907 2908 // Insert the new loop into the loop nest and register the new basic blocks 2909 // before calling any utilities such as SCEV that require valid LoopInfo. 2910 if (ParentLoop) { 2911 ParentLoop->addChildLoop(Lp); 2912 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); 2913 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); 2914 } else { 2915 LI->addTopLevelLoop(Lp); 2916 } 2917 Lp->addBasicBlockToLoop(VecBody, *LI); 2918 2919 // Find the loop boundaries. 2920 Value *Count = getOrCreateTripCount(Lp); 2921 2922 Value *StartIdx = ConstantInt::get(IdxTy, 0); 2923 2924 // Now, compare the new count to zero. If it is zero skip the vector loop and 2925 // jump to the scalar loop. 
This check also covers the case where the 2926 // backedge-taken count is uint##_max: adding one to it will overflow leading 2927 // to an incorrect trip count of zero. In this (rare) case we will also jump 2928 // to the scalar loop. 2929 emitMinimumIterationCountCheck(Lp, ScalarPH); 2930 2931 // Generate the code to check any assumptions that we've made for SCEV 2932 // expressions. 2933 emitSCEVChecks(Lp, ScalarPH); 2934 2935 // Generate the code that checks in runtime if arrays overlap. We put the 2936 // checks into a separate block to make the more common case of few elements 2937 // faster. 2938 emitMemRuntimeChecks(Lp, ScalarPH); 2939 2940 // Generate the induction variable. 2941 // The loop step is equal to the vectorization factor (num of SIMD elements) 2942 // times the unroll factor (num of SIMD instructions). 2943 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 2944 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 2945 Induction = 2946 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 2947 getDebugLocFromInstOrOperands(OldInduction)); 2948 2949 // We are going to resume the execution of the scalar loop. 2950 // Go over all of the induction variables that we found and fix the 2951 // PHIs that are left in the scalar version of the loop. 2952 // The starting values of PHI nodes depend on the counter of the last 2953 // iteration in the vectorized loop. 2954 // If we come from a bypass edge then we need to start from the original 2955 // start value. 2956 2957 // This variable saves the new starting index for the scalar loop. It is used 2958 // to test if there are any tail iterations left once the vector loop has 2959 // completed. 2960 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 2961 for (auto &InductionEntry : *List) { 2962 PHINode *OrigPhi = InductionEntry.first; 2963 InductionDescriptor II = InductionEntry.second; 2964 2965 // Create phi nodes to merge from the backedge-taken check block. 2966 PHINode *BCResumeVal = PHINode::Create( 2967 OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); 2968 // Copy original phi DL over to the new one. 2969 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 2970 Value *&EndValue = IVEndValues[OrigPhi]; 2971 if (OrigPhi == OldInduction) { 2972 // We know what the end value is. 2973 EndValue = CountRoundDown; 2974 } else { 2975 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 2976 Type *StepType = II.getStep()->getType(); 2977 Instruction::CastOps CastOp = 2978 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 2979 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 2980 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2981 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 2982 EndValue->setName("ind.end"); 2983 } 2984 2985 // The new PHI merges the original incoming value, in case of a bypass, 2986 // or the value at the end of the vectorized loop. 2987 BCResumeVal->addIncoming(EndValue, MiddleBlock); 2988 2989 // Fix the scalar body counter (PHI node). 2990 // The old induction's phi node in the scalar body needs the truncated 2991 // value. 2992 for (BasicBlock *BB : LoopBypassBlocks) 2993 BCResumeVal->addIncoming(II.getStartValue(), BB); 2994 OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); 2995 } 2996 2997 // We need the OrigLoop (scalar loop part) latch terminator to help 2998 // produce correct debug info for the middle block BB instructions. 
2999 // The legality check stage guarantees that the loop will have a single 3000 // latch. 3001 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3002 "Scalar loop latch terminator isn't a branch"); 3003 BranchInst *ScalarLatchBr = 3004 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3005 3006 // Add a check in the middle block to see if we have completed 3007 // all of the iterations in the first vector loop. 3008 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3009 // If tail is to be folded, we know we don't need to run the remainder. 3010 Value *CmpN = Builder.getTrue(); 3011 if (!Cost->foldTailByMasking()) { 3012 CmpN = 3013 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3014 CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); 3015 3016 // Here we use the same DebugLoc as the scalar loop latch branch instead 3017 // of the corresponding compare because they may have ended up with 3018 // different line numbers and we want to avoid awkward line stepping while 3019 // debugging. Eg. if the compare has got a line number inside the loop. 3020 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3021 } 3022 3023 BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); 3024 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3025 ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); 3026 3027 // Get ready to start creating new instructions into the vectorized body. 3028 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); 3029 3030 // Save the state. 3031 LoopVectorPreHeader = Lp->getLoopPreheader(); 3032 LoopScalarPreHeader = ScalarPH; 3033 LoopMiddleBlock = MiddleBlock; 3034 LoopExitBlock = ExitBlock; 3035 LoopVectorBody = VecBody; 3036 LoopScalarBody = OldBasicBlock; 3037 3038 Optional<MDNode *> VectorizedLoopID = 3039 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3040 LLVMLoopVectorizeFollowupVectorized}); 3041 if (VectorizedLoopID.hasValue()) { 3042 Lp->setLoopID(VectorizedLoopID.getValue()); 3043 3044 // Do not setAlreadyVectorized if loop attributes have been defined 3045 // explicitly. 3046 return LoopVectorPreHeader; 3047 } 3048 3049 // Keep all loop hints from the original loop on the vector loop (we'll 3050 // replace the vectorizer-specific hints below). 3051 if (MDNode *LID = OrigLoop->getLoopID()) 3052 Lp->setLoopID(LID); 3053 3054 LoopVectorizeHints Hints(Lp, true, *ORE); 3055 Hints.setAlreadyVectorized(); 3056 3057 return LoopVectorPreHeader; 3058 } 3059 3060 // Fix up external users of the induction variable. At this point, we are 3061 // in LCSSA form, with all external PHIs that use the IV having one input value, 3062 // coming from the remainder loop. We need those PHIs to also have a correct 3063 // value for the IV when arriving directly from the middle block. 3064 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3065 const InductionDescriptor &II, 3066 Value *CountRoundDown, Value *EndValue, 3067 BasicBlock *MiddleBlock) { 3068 // There are two kinds of external IV usages - those that use the value 3069 // computed in the last iteration (the PHI) and those that use the penultimate 3070 // value (the value that feeds into the phi from the loop latch). 3071 // We allow both, but they, obviously, have different values. 
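  // A shorthand sketch of the two cases (illustrative IR, names made up):
  //   loop:
  //     %iv      = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  //     %iv.next = add i64 %iv, 1
  //     ...
  //   exit:
  //     %a = phi i64 [ %iv.next, %loop ]  ; gets EndValue below
  //     %b = phi i64 [ %iv, %loop ]       ; gets Start + Step * (CRD - 1) below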

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {

struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
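  // For instance (illustrative): if unrolling leaves two identical
  //   %e = extractelement <4 x i32> %wide.vec, i32 0
  // instructions in the vector body, the second one is replaced by the first
  // and erased. Only instructions accepted by CSEDenseMapInfo::canHandle
  // (insertelement, extractelement, shufflevector, getelementptr) take part.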
3160 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3161 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3162 Instruction *In = &*I++; 3163 3164 if (!CSEDenseMapInfo::canHandle(In)) 3165 continue; 3166 3167 // Check if we can replace this instruction with any of the 3168 // visited instructions. 3169 if (Instruction *V = CSEMap.lookup(In)) { 3170 In->replaceAllUsesWith(V); 3171 In->eraseFromParent(); 3172 continue; 3173 } 3174 3175 CSEMap[In] = In; 3176 } 3177 } 3178 3179 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3180 unsigned VF, 3181 bool &NeedToScalarize) { 3182 Function *F = CI->getCalledFunction(); 3183 StringRef FnName = CI->getCalledFunction()->getName(); 3184 Type *ScalarRetTy = CI->getType(); 3185 SmallVector<Type *, 4> Tys, ScalarTys; 3186 for (auto &ArgOp : CI->arg_operands()) 3187 ScalarTys.push_back(ArgOp->getType()); 3188 3189 // Estimate cost of scalarized vector call. The source operands are assumed 3190 // to be vectors, so we need to extract individual elements from there, 3191 // execute VF scalar calls, and then gather the result into the vector return 3192 // value. 3193 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3194 if (VF == 1) 3195 return ScalarCallCost; 3196 3197 // Compute corresponding vector type for return value and arguments. 3198 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3199 for (Type *ScalarTy : ScalarTys) 3200 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3201 3202 // Compute costs of unpacking argument values for the scalar calls and 3203 // packing the return values to a vector. 3204 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3205 3206 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3207 3208 // If we can't emit a vector call for this function, then the currently found 3209 // cost is the cost we need to return. 3210 NeedToScalarize = true; 3211 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) 3212 return Cost; 3213 3214 // If the corresponding vector cost is cheaper, return its cost. 3215 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3216 if (VectorCallCost < Cost) { 3217 NeedToScalarize = false; 3218 return VectorCallCost; 3219 } 3220 return Cost; 3221 } 3222 3223 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3224 unsigned VF) { 3225 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3226 assert(ID && "Expected intrinsic call!"); 3227 3228 FastMathFlags FMF; 3229 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3230 FMF = FPMO->getFastMathFlags(); 3231 3232 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3233 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); 3234 } 3235 3236 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3237 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3238 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3239 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3240 } 3241 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3242 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3243 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3244 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3245 } 3246 3247 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3248 // For every instruction `I` in MinBWs, truncate the operands, create a 3249 // truncated version of `I` and reextend its result. 
InstCombine runs 3250 // later and will remove any ext/trunc pairs. 3251 SmallPtrSet<Value *, 4> Erased; 3252 for (const auto &KV : Cost->getMinimalBitwidths()) { 3253 // If the value wasn't vectorized, we must maintain the original scalar 3254 // type. The absence of the value from VectorLoopValueMap indicates that it 3255 // wasn't vectorized. 3256 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3257 continue; 3258 for (unsigned Part = 0; Part < UF; ++Part) { 3259 Value *I = getOrCreateVectorValue(KV.first, Part); 3260 if (Erased.find(I) != Erased.end() || I->use_empty() || 3261 !isa<Instruction>(I)) 3262 continue; 3263 Type *OriginalTy = I->getType(); 3264 Type *ScalarTruncatedTy = 3265 IntegerType::get(OriginalTy->getContext(), KV.second); 3266 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3267 OriginalTy->getVectorNumElements()); 3268 if (TruncatedTy == OriginalTy) 3269 continue; 3270 3271 IRBuilder<> B(cast<Instruction>(I)); 3272 auto ShrinkOperand = [&](Value *V) -> Value * { 3273 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3274 if (ZI->getSrcTy() == TruncatedTy) 3275 return ZI->getOperand(0); 3276 return B.CreateZExtOrTrunc(V, TruncatedTy); 3277 }; 3278 3279 // The actual instruction modification depends on the instruction type, 3280 // unfortunately. 3281 Value *NewI = nullptr; 3282 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3283 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3284 ShrinkOperand(BO->getOperand(1))); 3285 3286 // Any wrapping introduced by shrinking this operation shouldn't be 3287 // considered undefined behavior. So, we can't unconditionally copy 3288 // arithmetic wrapping flags to NewI. 3289 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3290 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3291 NewI = 3292 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3293 ShrinkOperand(CI->getOperand(1))); 3294 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3295 NewI = B.CreateSelect(SI->getCondition(), 3296 ShrinkOperand(SI->getTrueValue()), 3297 ShrinkOperand(SI->getFalseValue())); 3298 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3299 switch (CI->getOpcode()) { 3300 default: 3301 llvm_unreachable("Unhandled cast!"); 3302 case Instruction::Trunc: 3303 NewI = ShrinkOperand(CI->getOperand(0)); 3304 break; 3305 case Instruction::SExt: 3306 NewI = B.CreateSExtOrTrunc( 3307 CI->getOperand(0), 3308 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3309 break; 3310 case Instruction::ZExt: 3311 NewI = B.CreateZExtOrTrunc( 3312 CI->getOperand(0), 3313 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3314 break; 3315 } 3316 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3317 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3318 auto *O0 = B.CreateZExtOrTrunc( 3319 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3320 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3321 auto *O1 = B.CreateZExtOrTrunc( 3322 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3323 3324 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3325 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3326 // Don't do anything with the operands, just extend the result. 
3327 continue; 3328 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3329 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3330 auto *O0 = B.CreateZExtOrTrunc( 3331 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3332 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3333 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3334 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3335 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3336 auto *O0 = B.CreateZExtOrTrunc( 3337 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3338 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3339 } else { 3340 // If we don't know what to do, be conservative and don't do anything. 3341 continue; 3342 } 3343 3344 // Lastly, extend the result. 3345 NewI->takeName(cast<Instruction>(I)); 3346 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3347 I->replaceAllUsesWith(Res); 3348 cast<Instruction>(I)->eraseFromParent(); 3349 Erased.insert(I); 3350 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3351 } 3352 } 3353 3354 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3355 for (const auto &KV : Cost->getMinimalBitwidths()) { 3356 // If the value wasn't vectorized, we must maintain the original scalar 3357 // type. The absence of the value from VectorLoopValueMap indicates that it 3358 // wasn't vectorized. 3359 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3360 continue; 3361 for (unsigned Part = 0; Part < UF; ++Part) { 3362 Value *I = getOrCreateVectorValue(KV.first, Part); 3363 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3364 if (Inst && Inst->use_empty()) { 3365 Value *NewI = Inst->getOperand(0); 3366 Inst->eraseFromParent(); 3367 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3368 } 3369 } 3370 } 3371 } 3372 3373 void InnerLoopVectorizer::fixVectorizedLoop() { 3374 // Insert truncates and extends for any truncated instructions as hints to 3375 // InstCombine. 3376 if (VF > 1) 3377 truncateToMinimalBitwidths(); 3378 3379 // Fix widened non-induction PHIs by setting up the PHI operands. 3380 if (OrigPHIsToFix.size()) { 3381 assert(EnableVPlanNativePath && 3382 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3383 fixNonInductionPHIs(); 3384 } 3385 3386 // At this point every instruction in the original loop is widened to a 3387 // vector form. Now we need to fix the recurrences in the loop. These PHI 3388 // nodes are currently empty because we did not want to introduce cycles. 3389 // This is the second stage of vectorizing recurrences. 3390 fixCrossIterationPHIs(); 3391 3392 // Update the dominator tree. 3393 // 3394 // FIXME: After creating the structure of the new loop, the dominator tree is 3395 // no longer up-to-date, and it remains that way until we update it 3396 // here. An out-of-date dominator tree is problematic for SCEV, 3397 // because SCEVExpander uses it to guide code generation. The 3398 // vectorizer use SCEVExpanders in several places. Instead, we should 3399 // keep the dominator tree up-to-date as we go. 3400 updateAnalysis(); 3401 3402 // Fix-up external users of the induction variables. 
3403 for (auto &Entry : *Legal->getInductionVars()) 3404 fixupIVUsers(Entry.first, Entry.second, 3405 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3406 IVEndValues[Entry.first], LoopMiddleBlock); 3407 3408 fixLCSSAPHIs(); 3409 for (Instruction *PI : PredicatedInstructions) 3410 sinkScalarOperands(&*PI); 3411 3412 // Remove redundant induction instructions. 3413 cse(LoopVectorBody); 3414 } 3415 3416 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3417 // In order to support recurrences we need to be able to vectorize Phi nodes. 3418 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3419 // stage #2: We now need to fix the recurrences by adding incoming edges to 3420 // the currently empty PHI nodes. At this point every instruction in the 3421 // original loop is widened to a vector form so we can use them to construct 3422 // the incoming edges. 3423 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3424 // Handle first-order recurrences and reductions that need to be fixed. 3425 if (Legal->isFirstOrderRecurrence(&Phi)) 3426 fixFirstOrderRecurrence(&Phi); 3427 else if (Legal->isReductionVariable(&Phi)) 3428 fixReduction(&Phi); 3429 } 3430 } 3431 3432 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3433 // This is the second phase of vectorizing first-order recurrences. An 3434 // overview of the transformation is described below. Suppose we have the 3435 // following loop. 3436 // 3437 // for (int i = 0; i < n; ++i) 3438 // b[i] = a[i] - a[i - 1]; 3439 // 3440 // There is a first-order recurrence on "a". For this loop, the shorthand 3441 // scalar IR looks like: 3442 // 3443 // scalar.ph: 3444 // s_init = a[-1] 3445 // br scalar.body 3446 // 3447 // scalar.body: 3448 // i = phi [0, scalar.ph], [i+1, scalar.body] 3449 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3450 // s2 = a[i] 3451 // b[i] = s2 - s1 3452 // br cond, scalar.body, ... 3453 // 3454 // In this example, s1 is a recurrence because it's value depends on the 3455 // previous iteration. In the first phase of vectorization, we created a 3456 // temporary value for s1. We now complete the vectorization and produce the 3457 // shorthand vector IR shown below (for VF = 4, UF = 1). 3458 // 3459 // vector.ph: 3460 // v_init = vector(..., ..., ..., a[-1]) 3461 // br vector.body 3462 // 3463 // vector.body 3464 // i = phi [0, vector.ph], [i+4, vector.body] 3465 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3466 // v2 = a[i, i+1, i+2, i+3]; 3467 // v3 = vector(v1(3), v2(0, 1, 2)) 3468 // b[i, i+1, i+2, i+3] = v2 - v3 3469 // br cond, vector.body, middle.block 3470 // 3471 // middle.block: 3472 // x = v2(3) 3473 // br scalar.ph 3474 // 3475 // scalar.ph: 3476 // s_init = phi [x, middle.block], [a[-1], otherwise] 3477 // br scalar.body 3478 // 3479 // After execution completes the vector loop, we extract the next value of 3480 // the recurrence (x) to use as the initial value in the scalar loop. 3481 3482 // Get the original loop preheader and single loop latch. 3483 auto *Preheader = OrigLoop->getLoopPreheader(); 3484 auto *Latch = OrigLoop->getLoopLatch(); 3485 3486 // Get the initial and previous values of the scalar recurrence. 3487 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3488 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3489 3490 // Create a vector from the initial value. 
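  // For VF = 4, this produces, in the shorthand of the example above,
  // v_init = <undef, undef, undef, a[-1]>, i.e. the scalar initial value is
  // placed in the last vector lane.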
3491 auto *VectorInit = ScalarInit; 3492 if (VF > 1) { 3493 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3494 VectorInit = Builder.CreateInsertElement( 3495 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3496 Builder.getInt32(VF - 1), "vector.recur.init"); 3497 } 3498 3499 // We constructed a temporary phi node in the first phase of vectorization. 3500 // This phi node will eventually be deleted. 3501 Builder.SetInsertPoint( 3502 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3503 3504 // Create a phi node for the new recurrence. The current value will either be 3505 // the initial value inserted into a vector or loop-varying vector value. 3506 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3507 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3508 3509 // Get the vectorized previous value of the last part UF - 1. It appears last 3510 // among all unrolled iterations, due to the order of their construction. 3511 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3512 3513 // Set the insertion point after the previous value if it is an instruction. 3514 // Note that the previous value may have been constant-folded so it is not 3515 // guaranteed to be an instruction in the vector loop. Also, if the previous 3516 // value is a phi node, we should insert after all the phi nodes to avoid 3517 // breaking basic block verification. 3518 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || 3519 isa<PHINode>(PreviousLastPart)) 3520 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3521 else 3522 Builder.SetInsertPoint( 3523 &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); 3524 3525 // We will construct a vector for the recurrence by combining the values for 3526 // the current and previous iterations. This is the required shuffle mask. 3527 SmallVector<Constant *, 8> ShuffleMask(VF); 3528 ShuffleMask[0] = Builder.getInt32(VF - 1); 3529 for (unsigned I = 1; I < VF; ++I) 3530 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3531 3532 // The vector from which to take the initial value for the current iteration 3533 // (actual or unrolled). Initially, this is the vector phi node. 3534 Value *Incoming = VecPhi; 3535 3536 // Shuffle the current and previous vector and update the vector parts. 3537 for (unsigned Part = 0; Part < UF; ++Part) { 3538 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3539 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3540 auto *Shuffle = 3541 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3542 ConstantVector::get(ShuffleMask)) 3543 : Incoming; 3544 PhiPart->replaceAllUsesWith(Shuffle); 3545 cast<Instruction>(PhiPart)->eraseFromParent(); 3546 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3547 Incoming = PreviousPart; 3548 } 3549 3550 // Fix the latch value of the new recurrence in the vector loop. 3551 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3552 3553 // Extract the last vector element in the middle block. This will be the 3554 // initial value for the recurrence when jumping to the scalar loop. 
3555 auto *ExtractForScalar = Incoming; 3556 if (VF > 1) { 3557 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3558 ExtractForScalar = Builder.CreateExtractElement( 3559 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3560 } 3561 // Extract the second last element in the middle block if the 3562 // Phi is used outside the loop. We need to extract the phi itself 3563 // and not the last element (the phi update in the current iteration). This 3564 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3565 // when the scalar loop is not run at all. 3566 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3567 if (VF > 1) 3568 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3569 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3570 // When loop is unrolled without vectorizing, initialize 3571 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3572 // `Incoming`. This is analogous to the vectorized case above: extracting the 3573 // second last element when VF > 1. 3574 else if (UF > 1) 3575 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3576 3577 // Fix the initial value of the original recurrence in the scalar loop. 3578 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3579 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3580 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3581 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3582 Start->addIncoming(Incoming, BB); 3583 } 3584 3585 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3586 Phi->setName("scalar.recur"); 3587 3588 // Finally, fix users of the recurrence outside the loop. The users will need 3589 // either the last value of the scalar recurrence or the last value of the 3590 // vector recurrence we extracted in the middle block. Since the loop is in 3591 // LCSSA form, we just need to find all the phi nodes for the original scalar 3592 // recurrence in the exit block, and then add an edge for the middle block. 3593 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3594 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3595 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3596 } 3597 } 3598 } 3599 3600 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3601 Constant *Zero = Builder.getInt32(0); 3602 3603 // Get it's reduction variable descriptor. 3604 assert(Legal->isReductionVariable(Phi) && 3605 "Unable to find the reduction variable"); 3606 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; 3607 3608 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3609 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3610 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3611 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3612 RdxDesc.getMinMaxRecurrenceKind(); 3613 setDebugLocFromInst(Builder, ReductionStartValue); 3614 3615 // We need to generate a reduction vector from the incoming scalar. 3616 // To do so, we need to generate the 'identity' vector and override 3617 // one of the elements with the incoming scalar reduction. We need 3618 // to do it in the vector-loop preheader. 3619 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3620 3621 // This is the vector-clone of the value that leaves the loop. 3622 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3623 3624 // Find the reduction identity variable. 
  // Zero for addition, or and xor; one for multiplication; -1 for And.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
    cast<PHINode>(VecRdxPhi)
        ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ?
Builder.CreateSExt(Trunc, VecTy) 3693 : Builder.CreateZExt(Trunc, VecTy); 3694 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3695 UI != RdxParts[Part]->user_end();) 3696 if (*UI != Trunc) { 3697 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3698 RdxParts[Part] = Extnd; 3699 } else { 3700 ++UI; 3701 } 3702 } 3703 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3704 for (unsigned Part = 0; Part < UF; ++Part) { 3705 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3706 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3707 } 3708 } 3709 3710 // Reduce all of the unrolled parts into a single vector. 3711 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3712 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3713 3714 // The middle block terminator has already been assigned a DebugLoc here (the 3715 // OrigLoop's single latch terminator). We want the whole middle block to 3716 // appear to execute on this line because: (a) it is all compiler generated, 3717 // (b) these instructions are always executed after evaluating the latch 3718 // conditional branch, and (c) other passes may add new predecessors which 3719 // terminate on this line. This is the easiest way to ensure we don't 3720 // accidentally cause an extra step back into the loop while debugging. 3721 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3722 for (unsigned Part = 1; Part < UF; ++Part) { 3723 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3724 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3725 // Floating point operations had to be 'fast' to enable the reduction. 3726 ReducedPartRdx = addFastMathFlag( 3727 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3728 ReducedPartRdx, "bin.rdx"), 3729 RdxDesc.getFastMathFlags()); 3730 else 3731 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3732 RdxPart); 3733 } 3734 3735 if (VF > 1) { 3736 bool NoNaN = Legal->hasFunNoNaNAttr(); 3737 ReducedPartRdx = 3738 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3739 // If the reduction can be performed in a smaller type, we need to extend 3740 // the reduction to the wider type before we branch to the original loop. 3741 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3742 ReducedPartRdx = 3743 RdxDesc.isSigned() 3744 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3745 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3746 } 3747 3748 // Create a phi node that merges control-flow from the backedge-taken check 3749 // block and the middle block. 3750 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3751 LoopScalarPreHeader->getTerminator()); 3752 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3753 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3754 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3755 3756 // Now, we need to fix the users of the reduction variable 3757 // inside and outside of the scalar remainder loop. 3758 // We know that the loop is in LCSSA form. We need to update the 3759 // PHI nodes in the exit blocks. 3760 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3761 // All PHINodes need to have a single entry edge, or two if 3762 // we already fixed them. 3763 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3764 3765 // We found a reduction value exit-PHI. Update it with the 3766 // incoming bypass edge. 
3767 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3768 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3769 } // end of the LCSSA phi scan. 3770 3771 // Fix the scalar loop reduction variable with the incoming reduction sum 3772 // from the vector body and from the backedge value. 3773 int IncomingEdgeBlockIdx = 3774 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3775 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3776 // Pick the other block. 3777 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3778 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3779 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3780 } 3781 3782 void InnerLoopVectorizer::fixLCSSAPHIs() { 3783 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3784 if (LCSSAPhi.getNumIncomingValues() == 1) { 3785 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3786 // Non-instruction incoming values will have only one value. 3787 unsigned LastLane = 0; 3788 if (isa<Instruction>(IncomingValue)) 3789 LastLane = Cost->isUniformAfterVectorization( 3790 cast<Instruction>(IncomingValue), VF) 3791 ? 0 3792 : VF - 1; 3793 // Can be a loop invariant incoming value or the last scalar value to be 3794 // extracted from the vectorized loop. 3795 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3796 Value *lastIncomingValue = 3797 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 3798 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 3799 } 3800 } 3801 } 3802 3803 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3804 // The basic block and loop containing the predicated instruction. 3805 auto *PredBB = PredInst->getParent(); 3806 auto *VectorLoop = LI->getLoopFor(PredBB); 3807 3808 // Initialize a worklist with the operands of the predicated instruction. 3809 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3810 3811 // Holds instructions that we need to analyze again. An instruction may be 3812 // reanalyzed if we don't yet know if we can sink it or not. 3813 SmallVector<Instruction *, 8> InstsToReanalyze; 3814 3815 // Returns true if a given use occurs in the predicated block. Phi nodes use 3816 // their operands in their corresponding predecessor blocks. 3817 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3818 auto *I = cast<Instruction>(U.getUser()); 3819 BasicBlock *BB = I->getParent(); 3820 if (auto *Phi = dyn_cast<PHINode>(I)) 3821 BB = Phi->getIncomingBlock( 3822 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3823 return BB == PredBB; 3824 }; 3825 3826 // Iteratively sink the scalarized operands of the predicated instruction 3827 // into the block we created for it. When an instruction is sunk, it's 3828 // operands are then added to the worklist. The algorithm ends after one pass 3829 // through the worklist doesn't sink a single instruction. 3830 bool Changed; 3831 do { 3832 // Add the instructions that need to be reanalyzed to the worklist, and 3833 // reset the changed indicator. 3834 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3835 InstsToReanalyze.clear(); 3836 Changed = false; 3837 3838 while (!Worklist.empty()) { 3839 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3840 3841 // We can't sink an instruction if it is a phi node, is already in the 3842 // predicated block, is not in the loop, or may have side effects. 
3843 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 3844 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 3845 continue; 3846 3847 // It's legal to sink the instruction if all its uses occur in the 3848 // predicated block. Otherwise, there's nothing to do yet, and we may 3849 // need to reanalyze the instruction. 3850 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 3851 InstsToReanalyze.push_back(I); 3852 continue; 3853 } 3854 3855 // Move the instruction to the beginning of the predicated block, and add 3856 // it's operands to the worklist. 3857 I->moveBefore(&*PredBB->getFirstInsertionPt()); 3858 Worklist.insert(I->op_begin(), I->op_end()); 3859 3860 // The sinking may have enabled other instructions to be sunk, so we will 3861 // need to iterate. 3862 Changed = true; 3863 } 3864 } while (Changed); 3865 } 3866 3867 void InnerLoopVectorizer::fixNonInductionPHIs() { 3868 for (PHINode *OrigPhi : OrigPHIsToFix) { 3869 PHINode *NewPhi = 3870 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 3871 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 3872 3873 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 3874 predecessors(OrigPhi->getParent())); 3875 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 3876 predecessors(NewPhi->getParent())); 3877 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 3878 "Scalar and Vector BB should have the same number of predecessors"); 3879 3880 // The insertion point in Builder may be invalidated by the time we get 3881 // here. Force the Builder insertion point to something valid so that we do 3882 // not run into issues during insertion point restore in 3883 // getOrCreateVectorValue calls below. 3884 Builder.SetInsertPoint(NewPhi); 3885 3886 // The predecessor order is preserved and we can rely on mapping between 3887 // scalar and vector block predecessors. 3888 for (unsigned i = 0; i < NumIncomingValues; ++i) { 3889 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 3890 3891 // When looking up the new scalar/vector values to fix up, use incoming 3892 // values from original phi. 3893 Value *ScIncV = 3894 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 3895 3896 // Scalar incoming value may need a broadcast 3897 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 3898 NewPhi->addIncoming(NewIncV, NewPredBB); 3899 } 3900 } 3901 } 3902 3903 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 3904 unsigned VF) { 3905 PHINode *P = cast<PHINode>(PN); 3906 if (EnableVPlanNativePath) { 3907 // Currently we enter here in the VPlan-native path for non-induction 3908 // PHIs where all control flow is uniform. We simply widen these PHIs. 3909 // Create a vector phi with no operands - the vector phi operands will be 3910 // set at the end of vector code generation. 3911 Type *VecTy = 3912 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 3913 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 3914 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 3915 OrigPHIsToFix.push_back(P); 3916 3917 return; 3918 } 3919 3920 assert(PN->getParent() == OrigLoop->getHeader() && 3921 "Non-header phis should have been handled elsewhere"); 3922 3923 // In order to support recurrences we need to be able to vectorize Phi nodes. 3924 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3925 // stage #1: We create a new vector PHI node with no incoming edges. 
We'll use 3926 // this value when we vectorize all of the instructions that use the PHI. 3927 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 3928 for (unsigned Part = 0; Part < UF; ++Part) { 3929 // This is phase one of vectorizing PHIs. 3930 Type *VecTy = 3931 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 3932 Value *EntryPart = PHINode::Create( 3933 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 3934 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 3935 } 3936 return; 3937 } 3938 3939 setDebugLocFromInst(Builder, P); 3940 3941 // This PHINode must be an induction variable. 3942 // Make sure that we know about it. 3943 assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); 3944 3945 InductionDescriptor II = Legal->getInductionVars()->lookup(P); 3946 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 3947 3948 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 3949 // which can be found from the original scalar operations. 3950 switch (II.getKind()) { 3951 case InductionDescriptor::IK_NoInduction: 3952 llvm_unreachable("Unknown induction"); 3953 case InductionDescriptor::IK_IntInduction: 3954 case InductionDescriptor::IK_FpInduction: 3955 llvm_unreachable("Integer/fp induction is handled elsewhere."); 3956 case InductionDescriptor::IK_PtrInduction: { 3957 // Handle the pointer induction variable case. 3958 assert(P->getType()->isPointerTy() && "Unexpected type."); 3959 // This is the normalized GEP that starts counting at zero. 3960 Value *PtrInd = Induction; 3961 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 3962 // Determine the number of scalars we need to generate for each unroll 3963 // iteration. If the instruction is uniform, we only need to generate the 3964 // first lane. Otherwise, we generate all VF values. 3965 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 3966 // These are the scalar results. Notice that we don't generate vector GEPs 3967 // because scalar GEPs result in better code. 3968 for (unsigned Part = 0; Part < UF; ++Part) { 3969 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 3970 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 3971 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 3972 Value *SclrGep = 3973 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 3974 SclrGep->setName("next.gep"); 3975 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 3976 } 3977 } 3978 return; 3979 } 3980 } 3981 } 3982 3983 /// A helper function for checking whether an integer division-related 3984 /// instruction may divide by zero (in which case it must be predicated if 3985 /// executed conditionally in the scalar code). 3986 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 3987 /// Non-zero divisors that are non compile-time constants will not be 3988 /// converted into multiplication, so we will still end up scalarizing 3989 /// the division, but can do so w/o predication. 
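/// For example (illustrative only): a conditionally executed `x / 7` has a
/// known non-zero constant divisor and can be scalarized without predication,
/// whereas `x / y` with a loop-varying `y` may divide by zero on an iteration
/// whose condition is false, so it must be predicated.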
3990 static bool mayDivideByZero(Instruction &I) { 3991 assert((I.getOpcode() == Instruction::UDiv || 3992 I.getOpcode() == Instruction::SDiv || 3993 I.getOpcode() == Instruction::URem || 3994 I.getOpcode() == Instruction::SRem) && 3995 "Unexpected instruction"); 3996 Value *Divisor = I.getOperand(1); 3997 auto *CInt = dyn_cast<ConstantInt>(Divisor); 3998 return !CInt || CInt->isZero(); 3999 } 4000 4001 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4002 switch (I.getOpcode()) { 4003 case Instruction::Br: 4004 case Instruction::PHI: 4005 llvm_unreachable("This instruction is handled by a different recipe."); 4006 case Instruction::GetElementPtr: { 4007 // Construct a vector GEP by widening the operands of the scalar GEP as 4008 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4009 // results in a vector of pointers when at least one operand of the GEP 4010 // is vector-typed. Thus, to keep the representation compact, we only use 4011 // vector-typed operands for loop-varying values. 4012 auto *GEP = cast<GetElementPtrInst>(&I); 4013 4014 if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { 4015 // If we are vectorizing, but the GEP has only loop-invariant operands, 4016 // the GEP we build (by only using vector-typed operands for 4017 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4018 // produce a vector of pointers, we need to either arbitrarily pick an 4019 // operand to broadcast, or broadcast a clone of the original GEP. 4020 // Here, we broadcast a clone of the original. 4021 // 4022 // TODO: If at some point we decide to scalarize instructions having 4023 // loop-invariant operands, this special case will no longer be 4024 // required. We would add the scalarization decision to 4025 // collectLoopScalars() and teach getVectorValue() to broadcast 4026 // the lane-zero scalar value. 4027 auto *Clone = Builder.Insert(GEP->clone()); 4028 for (unsigned Part = 0; Part < UF; ++Part) { 4029 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4030 VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); 4031 addMetadata(EntryPart, GEP); 4032 } 4033 } else { 4034 // If the GEP has at least one loop-varying operand, we are sure to 4035 // produce a vector of pointers. But if we are only unrolling, we want 4036 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4037 // produce with the code below will be scalar (if VF == 1) or vector 4038 // (otherwise). Note that for the unroll-only case, we still maintain 4039 // values in the vector mapping with initVector, as we do for other 4040 // instructions. 4041 for (unsigned Part = 0; Part < UF; ++Part) { 4042 // The pointer operand of the new GEP. If it's loop-invariant, we 4043 // won't broadcast it. 4044 auto *Ptr = 4045 OrigLoop->isLoopInvariant(GEP->getPointerOperand()) 4046 ? GEP->getPointerOperand() 4047 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4048 4049 // Collect all the indices for the new GEP. If any index is 4050 // loop-invariant, we won't broadcast it. 4051 SmallVector<Value *, 4> Indices; 4052 for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { 4053 if (OrigLoop->isLoopInvariant(U.get())) 4054 Indices.push_back(U.get()); 4055 else 4056 Indices.push_back(getOrCreateVectorValue(U.get(), Part)); 4057 } 4058 4059 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4060 // but it should be a vector, otherwise. 4061 auto *NewGEP = 4062 GEP->isInBounds() 4063 ? 
Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4064 Indices) 4065 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4066 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4067 "NewGEP is not a pointer vector"); 4068 VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); 4069 addMetadata(NewGEP, GEP); 4070 } 4071 } 4072 4073 break; 4074 } 4075 case Instruction::UDiv: 4076 case Instruction::SDiv: 4077 case Instruction::SRem: 4078 case Instruction::URem: 4079 case Instruction::Add: 4080 case Instruction::FAdd: 4081 case Instruction::Sub: 4082 case Instruction::FSub: 4083 case Instruction::FNeg: 4084 case Instruction::Mul: 4085 case Instruction::FMul: 4086 case Instruction::FDiv: 4087 case Instruction::FRem: 4088 case Instruction::Shl: 4089 case Instruction::LShr: 4090 case Instruction::AShr: 4091 case Instruction::And: 4092 case Instruction::Or: 4093 case Instruction::Xor: { 4094 // Just widen unops and binops. 4095 setDebugLocFromInst(Builder, &I); 4096 4097 for (unsigned Part = 0; Part < UF; ++Part) { 4098 SmallVector<Value *, 2> Ops; 4099 for (Value *Op : I.operands()) 4100 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4101 4102 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4103 4104 if (auto *VecOp = dyn_cast<Instruction>(V)) 4105 VecOp->copyIRFlags(&I); 4106 4107 // Use this vector value for all users of the original instruction. 4108 VectorLoopValueMap.setVectorValue(&I, Part, V); 4109 addMetadata(V, &I); 4110 } 4111 4112 break; 4113 } 4114 case Instruction::Select: { 4115 // Widen selects. 4116 // If the selector is loop invariant we can create a select 4117 // instruction with a scalar condition. Otherwise, use vector-select. 4118 auto *SE = PSE.getSE(); 4119 bool InvariantCond = 4120 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4121 setDebugLocFromInst(Builder, &I); 4122 4123 // The condition can be loop invariant but still defined inside the 4124 // loop. This means that we can't just use the original 'cond' value. 4125 // We have to take the 'vectorized' value and pick the first lane. 4126 // Instcombine will make this a no-op. 4127 4128 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4129 4130 for (unsigned Part = 0; Part < UF; ++Part) { 4131 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4132 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4133 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4134 Value *Sel = 4135 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4136 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4137 addMetadata(Sel, &I); 4138 } 4139 4140 break; 4141 } 4142 4143 case Instruction::ICmp: 4144 case Instruction::FCmp: { 4145 // Widen compares. Generate vector compares. 4146 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4147 auto *Cmp = dyn_cast<CmpInst>(&I); 4148 setDebugLocFromInst(Builder, Cmp); 4149 for (unsigned Part = 0; Part < UF; ++Part) { 4150 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4151 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4152 Value *C = nullptr; 4153 if (FCmp) { 4154 // Propagate fast math flags. 
4155 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4156 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4157 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4158 } else { 4159 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4160 } 4161 VectorLoopValueMap.setVectorValue(&I, Part, C); 4162 addMetadata(C, &I); 4163 } 4164 4165 break; 4166 } 4167 4168 case Instruction::ZExt: 4169 case Instruction::SExt: 4170 case Instruction::FPToUI: 4171 case Instruction::FPToSI: 4172 case Instruction::FPExt: 4173 case Instruction::PtrToInt: 4174 case Instruction::IntToPtr: 4175 case Instruction::SIToFP: 4176 case Instruction::UIToFP: 4177 case Instruction::Trunc: 4178 case Instruction::FPTrunc: 4179 case Instruction::BitCast: { 4180 auto *CI = dyn_cast<CastInst>(&I); 4181 setDebugLocFromInst(Builder, CI); 4182 4183 /// Vectorize casts. 4184 Type *DestTy = 4185 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4186 4187 for (unsigned Part = 0; Part < UF; ++Part) { 4188 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4189 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4190 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4191 addMetadata(Cast, &I); 4192 } 4193 break; 4194 } 4195 4196 case Instruction::Call: { 4197 // Ignore dbg intrinsics. 4198 if (isa<DbgInfoIntrinsic>(I)) 4199 break; 4200 setDebugLocFromInst(Builder, &I); 4201 4202 Module *M = I.getParent()->getParent()->getParent(); 4203 auto *CI = cast<CallInst>(&I); 4204 4205 StringRef FnName = CI->getCalledFunction()->getName(); 4206 Function *F = CI->getCalledFunction(); 4207 Type *RetTy = ToVectorTy(CI->getType(), VF); 4208 SmallVector<Type *, 4> Tys; 4209 for (Value *ArgOperand : CI->arg_operands()) 4210 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4211 4212 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4213 4214 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4215 // version of the instruction. 4216 // Is it beneficial to perform intrinsic call compared to lib call? 4217 bool NeedToScalarize; 4218 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4219 bool UseVectorIntrinsic = 4220 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4221 assert((UseVectorIntrinsic || !NeedToScalarize) && 4222 "Instruction should be scalarized elsewhere."); 4223 4224 for (unsigned Part = 0; Part < UF; ++Part) { 4225 SmallVector<Value *, 4> Args; 4226 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4227 Value *Arg = CI->getArgOperand(i); 4228 // Some intrinsics have a scalar argument - don't replace it with a 4229 // vector. 4230 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4231 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4232 Args.push_back(Arg); 4233 } 4234 4235 Function *VectorF; 4236 if (UseVectorIntrinsic) { 4237 // Use vector version of the intrinsic. 4238 Type *TysForDecl[] = {CI->getType()}; 4239 if (VF > 1) 4240 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4241 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4242 } else { 4243 // Use vector version of the library call. 
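// For example (illustrative; the exact mapping depends on the vector library
// configured in TLI), a call to `sinf` at VF = 4 may be redirected to a
// 4-lane vector variant of the routine rather than the scalar libm symbol.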
4244 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); 4245 assert(!VFnName.empty() && "Vector function name is empty."); 4246 VectorF = M->getFunction(VFnName); 4247 if (!VectorF) { 4248 // Generate a declaration 4249 FunctionType *FTy = FunctionType::get(RetTy, Tys, false); 4250 VectorF = 4251 Function::Create(FTy, Function::ExternalLinkage, VFnName, M); 4252 VectorF->copyAttributesFrom(F); 4253 } 4254 } 4255 assert(VectorF && "Can't create vector function."); 4256 4257 SmallVector<OperandBundleDef, 1> OpBundles; 4258 CI->getOperandBundlesAsDefs(OpBundles); 4259 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4260 4261 if (isa<FPMathOperator>(V)) 4262 V->copyFastMathFlags(CI); 4263 4264 VectorLoopValueMap.setVectorValue(&I, Part, V); 4265 addMetadata(V, &I); 4266 } 4267 4268 break; 4269 } 4270 4271 default: 4272 // This instruction is not vectorized by simple widening. 4273 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4274 llvm_unreachable("Unhandled instruction!"); 4275 } // end of switch. 4276 } 4277 4278 void InnerLoopVectorizer::updateAnalysis() { 4279 // Forget the original basic block. 4280 PSE.getSE()->forgetLoop(OrigLoop); 4281 4282 // DT is not kept up-to-date for outer loop vectorization 4283 if (EnableVPlanNativePath) 4284 return; 4285 4286 // Update the dominator tree information. 4287 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && 4288 "Entry does not dominate exit."); 4289 4290 DT->addNewBlock(LoopMiddleBlock, 4291 LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4292 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); 4293 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 4294 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); 4295 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 4296 } 4297 4298 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4299 // We should not collect Scalars more than once per VF. Right now, this 4300 // function is called from collectUniformsAndScalars(), which already does 4301 // this check. Collecting Scalars for VF=1 does not make any sense. 4302 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4303 "This function should not be visited twice for the same VF"); 4304 4305 SmallSetVector<Instruction *, 8> Worklist; 4306 4307 // These sets are used to seed the analysis with pointers used by memory 4308 // accesses that will remain scalar. 4309 SmallSetVector<Instruction *, 8> ScalarPtrs; 4310 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4311 4312 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4313 // The pointer operands of loads and stores will be scalar as long as the 4314 // memory access is not a gather or scatter operation. The value operand of a 4315 // store will remain scalar if the store is scalarized. 
4316 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4317 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4318 assert(WideningDecision != CM_Unknown &&
4319 "Widening decision should be ready at this moment");
4320 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4321 if (Ptr == Store->getValueOperand())
4322 return WideningDecision == CM_Scalarize;
4323 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4324 "Ptr is neither a value nor a pointer operand");
4325 return WideningDecision != CM_GatherScatter;
4326 };
4327
4328 // A helper that returns true if the given value is a bitcast or
4329 // getelementptr instruction contained in the loop.
4330 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4331 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4332 isa<GetElementPtrInst>(V)) &&
4333 !TheLoop->isLoopInvariant(V);
4334 };
4335
4336 // A helper that evaluates a memory access's use of a pointer. If the use
4337 // will be a scalar use, and the pointer is only used by memory accesses, we
4338 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4339 // PossibleNonScalarPtrs.
4340 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4341 // We only care about bitcast and getelementptr instructions contained in
4342 // the loop.
4343 if (!isLoopVaryingBitCastOrGEP(Ptr))
4344 return;
4345
4346 // If the pointer has already been identified as scalar (e.g., if it was
4347 // also identified as uniform), there's nothing to do.
4348 auto *I = cast<Instruction>(Ptr);
4349 if (Worklist.count(I))
4350 return;
4351
4352 // If the use of the pointer will be a scalar use, and all users of the
4353 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4354 // place the pointer in PossibleNonScalarPtrs.
4355 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4356 return isa<LoadInst>(U) || isa<StoreInst>(U);
4357 }))
4358 ScalarPtrs.insert(I);
4359 else
4360 PossibleNonScalarPtrs.insert(I);
4361 };
4362
4363 // We seed the scalars analysis with three classes of instructions: (1)
4364 // instructions marked uniform-after-vectorization, (2) bitcast and
4365 // getelementptr instructions used by memory accesses requiring a scalar use,
4366 // and (3) pointer induction variables and their update instructions (we
4367 // currently only scalarize these).
4368 //
4369 // (1) Add to the worklist all instructions that have been identified as
4370 // uniform-after-vectorization.
4371 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4372
4373 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4374 // memory accesses requiring a scalar use. The pointer operands of loads and
4375 // stores will be scalar as long as the memory access is not a gather or
4376 // scatter operation. The value operand of a store will remain scalar if the
4377 // store is scalarized.
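// For example (illustrative IR, assuming the access is consecutive and will
// be widened rather than gathered/scattered):
//   %gep = getelementptr inbounds i32, i32* %A, i64 %iv
//   %v   = load i32, i32* %gep
// Here %gep is placed in ScalarPtrs, since its only user is a memory access
// that uses it as a scalar address.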
4378 for (auto *BB : TheLoop->blocks()) 4379 for (auto &I : *BB) { 4380 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4381 evaluatePtrUse(Load, Load->getPointerOperand()); 4382 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4383 evaluatePtrUse(Store, Store->getPointerOperand()); 4384 evaluatePtrUse(Store, Store->getValueOperand()); 4385 } 4386 } 4387 for (auto *I : ScalarPtrs) 4388 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4389 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4390 Worklist.insert(I); 4391 } 4392 4393 // (3) Add to the worklist all pointer induction variables and their update 4394 // instructions. 4395 // 4396 // TODO: Once we are able to vectorize pointer induction variables we should 4397 // no longer insert them into the worklist here. 4398 auto *Latch = TheLoop->getLoopLatch(); 4399 for (auto &Induction : *Legal->getInductionVars()) { 4400 auto *Ind = Induction.first; 4401 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4402 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4403 continue; 4404 Worklist.insert(Ind); 4405 Worklist.insert(IndUpdate); 4406 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4407 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4408 << "\n"); 4409 } 4410 4411 // Insert the forced scalars. 4412 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4413 // induction variable when the PHI user is scalarized. 4414 auto ForcedScalar = ForcedScalars.find(VF); 4415 if (ForcedScalar != ForcedScalars.end()) 4416 for (auto *I : ForcedScalar->second) 4417 Worklist.insert(I); 4418 4419 // Expand the worklist by looking through any bitcasts and getelementptr 4420 // instructions we've already identified as scalar. This is similar to the 4421 // expansion step in collectLoopUniforms(); however, here we're only 4422 // expanding to include additional bitcasts and getelementptr instructions. 4423 unsigned Idx = 0; 4424 while (Idx != Worklist.size()) { 4425 Instruction *Dst = Worklist[Idx++]; 4426 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4427 continue; 4428 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4429 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4430 auto *J = cast<Instruction>(U); 4431 return !TheLoop->contains(J) || Worklist.count(J) || 4432 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4433 isScalarUse(J, Src)); 4434 })) { 4435 Worklist.insert(Src); 4436 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4437 } 4438 } 4439 4440 // An induction variable will remain scalar if all users of the induction 4441 // variable and induction variable update remain scalar. 4442 for (auto &Induction : *Legal->getInductionVars()) { 4443 auto *Ind = Induction.first; 4444 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4445 4446 // We already considered pointer induction variables, so there's no reason 4447 // to look at their users again. 4448 // 4449 // TODO: Once we are able to vectorize pointer induction variables we 4450 // should no longer skip over them here. 4451 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4452 continue; 4453 4454 // Determine if all users of the induction variable are scalar after 4455 // vectorization. 
4456 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4457 auto *I = cast<Instruction>(U);
4458 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4459 });
4460 if (!ScalarInd)
4461 continue;
4462
4463 // Determine if all users of the induction variable update instruction are
4464 // scalar after vectorization.
4465 auto ScalarIndUpdate =
4466 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4467 auto *I = cast<Instruction>(U);
4468 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4469 });
4470 if (!ScalarIndUpdate)
4471 continue;
4472
4473 // The induction variable and its update instruction will remain scalar.
4474 Worklist.insert(Ind);
4475 Worklist.insert(IndUpdate);
4476 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4477 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4478 << "\n");
4479 }
4480
4481 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4482 }
4483
4484 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4485 if (!blockNeedsPredication(I->getParent()))
4486 return false;
4487 switch(I->getOpcode()) {
4488 default:
4489 break;
4490 case Instruction::Load:
4491 case Instruction::Store: {
4492 if (!Legal->isMaskRequired(I))
4493 return false;
4494 auto *Ptr = getLoadStorePointerOperand(I);
4495 auto *Ty = getMemInstValueType(I);
4496 // We have already decided how to vectorize this instruction, get that
4497 // result.
4498 if (VF > 1) {
4499 InstWidening WideningDecision = getWideningDecision(I, VF);
4500 assert(WideningDecision != CM_Unknown &&
4501 "Widening decision should be ready at this moment");
4502 return WideningDecision == CM_Scalarize;
4503 }
4504 return isa<LoadInst>(I) ?
4505 !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4506 : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4507 }
4508 case Instruction::UDiv:
4509 case Instruction::SDiv:
4510 case Instruction::SRem:
4511 case Instruction::URem:
4512 return mayDivideByZero(*I);
4513 }
4514 return false;
4515 }
4516
4517 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4518 unsigned VF) {
4519 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4520 assert(getWideningDecision(I, VF) == CM_Unknown &&
4521 "Decision should not be set yet.");
4522 auto *Group = getInterleavedAccessGroup(I);
4523 assert(Group && "Must have a group.");
4524
4525 // If the instruction's allocated size doesn't equal its type size, it
4526 // requires padding and will be scalarized.
4527 auto &DL = I->getModule()->getDataLayout();
4528 auto *ScalarTy = getMemInstValueType(I);
4529 if (hasIrregularType(ScalarTy, DL, VF))
4530 return false;
4531
4532 // Check if masking is required.
4533 // A Group may need masking for one of two reasons: it resides in a block that
4534 // needs predication, or it was decided to use masking to deal with gaps.
4535 bool PredicatedAccessRequiresMasking =
4536 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4537 bool AccessWithGapsRequiresMasking =
4538 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4539 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4540 return true;
4541
4542 // If masked interleaving is required, we expect that the user/target had
4543 // enabled it, because otherwise it either wouldn't have been created or
4544 // it should have been invalidated by the CostModel.
4545 assert(useMaskedInterleavedAccesses(TTI) &&
4546 "Masked interleave-groups for predicated accesses are not enabled.");
4547
4548 auto *Ty = getMemInstValueType(I);
4549 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4550 : TTI.isLegalMaskedStore(Ty);
4551 }
4552
4553 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4554 unsigned VF) {
4555 // Get and ensure we have a valid memory instruction.
4556 LoadInst *LI = dyn_cast<LoadInst>(I);
4557 StoreInst *SI = dyn_cast<StoreInst>(I);
4558 assert((LI || SI) && "Invalid memory instruction");
4559
4560 auto *Ptr = getLoadStorePointerOperand(I);
4561
4562 // In order to be widened, the pointer must first of all be consecutive.
4563 if (!Legal->isConsecutivePtr(Ptr))
4564 return false;
4565
4566 // If the instruction is a store located in a predicated block, it will be
4567 // scalarized.
4568 if (isScalarWithPredication(I))
4569 return false;
4570
4571 // If the instruction's allocated size doesn't equal its type size, it
4572 // requires padding and will be scalarized.
4573 auto &DL = I->getModule()->getDataLayout();
4574 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4575 if (hasIrregularType(ScalarTy, DL, VF))
4576 return false;
4577
4578 return true;
4579 }
4580
4581 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4582 // We should not collect Uniforms more than once per VF. Right now,
4583 // this function is called from collectUniformsAndScalars(), which
4584 // already does this check. Collecting Uniforms for VF=1 does not make any
4585 // sense.
4586
4587 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4588 "This function should not be visited twice for the same VF");
4589
4590 // Visit the list of Uniforms. If we find no uniform value, we won't
4591 // analyze it again; Uniforms.count(VF) will still return 1.
4592 Uniforms[VF].clear();
4593
4594 // We now know that the loop is vectorizable!
4595 // Collect instructions inside the loop that will remain uniform after
4596 // vectorization.
4597
4598 // Global values, params and instructions outside of the current loop are out
4599 // of scope.
4600 auto isOutOfScope = [&](Value *V) -> bool {
4601 Instruction *I = dyn_cast<Instruction>(V);
4602 return (!I || !TheLoop->contains(I));
4603 };
4604
4605 SetVector<Instruction *> Worklist;
4606 BasicBlock *Latch = TheLoop->getLoopLatch();
4607
4608 // Start with the conditional branch. If the branch condition is an
4609 // instruction contained in the loop that is only used by the branch, it is
4610 // uniform.
4611 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4612 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4613 Worklist.insert(Cmp);
4614 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4615 }
4616
4617 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4618 // are pointers that are treated like consecutive pointers during
4619 // vectorization. The pointer operands of interleaved accesses are an
4620 // example.
4621 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4622
4623 // Holds pointer operands of instructions that are possibly non-uniform.
4624 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4625 4626 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4627 InstWidening WideningDecision = getWideningDecision(I, VF); 4628 assert(WideningDecision != CM_Unknown && 4629 "Widening decision should be ready at this moment"); 4630 4631 return (WideningDecision == CM_Widen || 4632 WideningDecision == CM_Widen_Reverse || 4633 WideningDecision == CM_Interleave); 4634 }; 4635 // Iterate over the instructions in the loop, and collect all 4636 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4637 // that a consecutive-like pointer operand will be scalarized, we collect it 4638 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4639 // getelementptr instruction can be used by both vectorized and scalarized 4640 // memory instructions. For example, if a loop loads and stores from the same 4641 // location, but the store is conditional, the store will be scalarized, and 4642 // the getelementptr won't remain uniform. 4643 for (auto *BB : TheLoop->blocks()) 4644 for (auto &I : *BB) { 4645 // If there's no pointer operand, there's nothing to do. 4646 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4647 if (!Ptr) 4648 continue; 4649 4650 // True if all users of Ptr are memory accesses that have Ptr as their 4651 // pointer operand. 4652 auto UsersAreMemAccesses = 4653 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4654 return getLoadStorePointerOperand(U) == Ptr; 4655 }); 4656 4657 // Ensure the memory instruction will not be scalarized or used by 4658 // gather/scatter, making its pointer operand non-uniform. If the pointer 4659 // operand is used by any instruction other than a memory access, we 4660 // conservatively assume the pointer operand may be non-uniform. 4661 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4662 PossibleNonUniformPtrs.insert(Ptr); 4663 4664 // If the memory instruction will be vectorized and its pointer operand 4665 // is consecutive-like, or interleaving - the pointer operand should 4666 // remain uniform. 4667 else 4668 ConsecutiveLikePtrs.insert(Ptr); 4669 } 4670 4671 // Add to the Worklist all consecutive and consecutive-like pointers that 4672 // aren't also identified as possibly non-uniform. 4673 for (auto *V : ConsecutiveLikePtrs) 4674 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { 4675 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); 4676 Worklist.insert(V); 4677 } 4678 4679 // Expand Worklist in topological order: whenever a new instruction 4680 // is added , its users should be already inside Worklist. It ensures 4681 // a uniform instruction will only be used by uniform instructions. 4682 unsigned idx = 0; 4683 while (idx != Worklist.size()) { 4684 Instruction *I = Worklist[idx++]; 4685 4686 for (auto OV : I->operand_values()) { 4687 // isOutOfScope operands cannot be uniform instructions. 4688 if (isOutOfScope(OV)) 4689 continue; 4690 // First order recurrence Phi's should typically be considered 4691 // non-uniform. 4692 auto *OP = dyn_cast<PHINode>(OV); 4693 if (OP && Legal->isFirstOrderRecurrence(OP)) 4694 continue; 4695 // If all the users of the operand are uniform, then add the 4696 // operand into the uniform worklist. 
4697 auto *OI = cast<Instruction>(OV); 4698 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4699 auto *J = cast<Instruction>(U); 4700 return Worklist.count(J) || 4701 (OI == getLoadStorePointerOperand(J) && 4702 isUniformDecision(J, VF)); 4703 })) { 4704 Worklist.insert(OI); 4705 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); 4706 } 4707 } 4708 } 4709 4710 // Returns true if Ptr is the pointer operand of a memory access instruction 4711 // I, and I is known to not require scalarization. 4712 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4713 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4714 }; 4715 4716 // For an instruction to be added into Worklist above, all its users inside 4717 // the loop should also be in Worklist. However, this condition cannot be 4718 // true for phi nodes that form a cyclic dependence. We must process phi 4719 // nodes separately. An induction variable will remain uniform if all users 4720 // of the induction variable and induction variable update remain uniform. 4721 // The code below handles both pointer and non-pointer induction variables. 4722 for (auto &Induction : *Legal->getInductionVars()) { 4723 auto *Ind = Induction.first; 4724 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4725 4726 // Determine if all users of the induction variable are uniform after 4727 // vectorization. 4728 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4729 auto *I = cast<Instruction>(U); 4730 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4731 isVectorizedMemAccessUse(I, Ind); 4732 }); 4733 if (!UniformInd) 4734 continue; 4735 4736 // Determine if all users of the induction variable update instruction are 4737 // uniform after vectorization. 4738 auto UniformIndUpdate = 4739 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4740 auto *I = cast<Instruction>(U); 4741 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4742 isVectorizedMemAccessUse(I, IndUpdate); 4743 }); 4744 if (!UniformIndUpdate) 4745 continue; 4746 4747 // The induction variable and its update instruction will remain uniform. 4748 Worklist.insert(Ind); 4749 Worklist.insert(IndUpdate); 4750 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); 4751 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate 4752 << "\n"); 4753 } 4754 4755 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4756 } 4757 4758 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4759 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4760 4761 if (Legal->getRuntimePointerChecking()->Need) { 4762 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4763 "runtime pointer checks needed. Enable vectorization of this " 4764 "loop with '#pragma clang loop vectorize(enable)' when " 4765 "compiling with -Os/-Oz", 4766 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4767 return true; 4768 } 4769 4770 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4771 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4772 "runtime SCEV checks needed. Enable vectorization of this " 4773 "loop with '#pragma clang loop vectorize(enable)' when " 4774 "compiling with -Os/-Oz", 4775 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4776 return true; 4777 } 4778 4779 // FIXME: Avoid specializing for stride==1 instead of bailing out. 
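// (Illustrative: a loop that indexes `A[i * Stride]`, where Stride is only
// known at run time, is currently vectorized under a versioning check that
// Stride == 1; it is that extra runtime check which makes the loop
// unsuitable when optimizing for size.)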
4780 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4781 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4782 "runtime stride == 1 checks needed. Enable vectorization of " 4783 "this loop with '#pragma clang loop vectorize(enable)' when " 4784 "compiling with -Os/-Oz", 4785 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4786 return true; 4787 } 4788 4789 return false; 4790 } 4791 4792 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4793 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4794 // TODO: It may by useful to do since it's still likely to be dynamically 4795 // uniform if the target can skip. 4796 reportVectorizationFailure( 4797 "Not inserting runtime ptr check for divergent target", 4798 "runtime pointer checks needed. Not enabled for divergent target", 4799 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4800 return None; 4801 } 4802 4803 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4804 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4805 if (TC == 1) { 4806 reportVectorizationFailure("Single iteration (non) loop", 4807 "loop trip count is one, irrelevant for vectorization", 4808 "SingleIterationLoop", ORE, TheLoop); 4809 return None; 4810 } 4811 4812 switch (ScalarEpilogueStatus) { 4813 case CM_ScalarEpilogueAllowed: 4814 return computeFeasibleMaxVF(TC); 4815 case CM_ScalarEpilogueNotNeededUsePredicate: 4816 LLVM_DEBUG( 4817 dbgs() << "LV: vector predicate hint/switch found.\n" 4818 << "LV: Not allowing scalar epilogue, creating predicated " 4819 << "vector loop.\n"); 4820 break; 4821 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4822 // fallthrough as a special case of OptForSize 4823 case CM_ScalarEpilogueNotAllowedOptSize: 4824 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4825 LLVM_DEBUG( 4826 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4827 else 4828 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4829 << "count.\n"); 4830 4831 // Bail if runtime checks are required, which are not good when optimising 4832 // for size. 4833 if (runtimeChecksRequired()) 4834 return None; 4835 break; 4836 } 4837 4838 // Now try the tail folding 4839 4840 // Invalidate interleave groups that require an epilogue if we can't mask 4841 // the interleave-group. 4842 if (!useMaskedInterleavedAccesses(TTI)) 4843 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4844 4845 unsigned MaxVF = computeFeasibleMaxVF(TC); 4846 if (TC > 0 && TC % MaxVF == 0) { 4847 // Accept MaxVF if we do not have a tail. 4848 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4849 return MaxVF; 4850 } 4851 4852 // If we don't know the precise trip count, or if the trip count that we 4853 // found modulo the vectorization factor is not zero, try to fold the tail 4854 // by masking. 4855 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4856 if (Legal->prepareToFoldTailByMasking()) { 4857 FoldTailByMasking = true; 4858 return MaxVF; 4859 } 4860 4861 if (TC == 0) { 4862 reportVectorizationFailure( 4863 "Unable to calculate the loop count due to complex control flow", 4864 "unable to calculate the loop count due to complex control flow", 4865 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4866 return None; 4867 } 4868 4869 reportVectorizationFailure( 4870 "Cannot optimize for size and vectorize at the same time.", 4871 "cannot optimize for size and vectorize at the same time. 
" 4872 "Enable vectorization of this loop with '#pragma clang loop " 4873 "vectorize(enable)' when compiling with -Os/-Oz", 4874 "NoTailLoopWithOptForSize", ORE, TheLoop); 4875 return None; 4876 } 4877 4878 unsigned 4879 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 4880 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4881 unsigned SmallestType, WidestType; 4882 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4883 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 4884 4885 // Get the maximum safe dependence distance in bits computed by LAA. 4886 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4887 // the memory accesses that is most restrictive (involved in the smallest 4888 // dependence distance). 4889 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 4890 4891 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 4892 4893 unsigned MaxVectorSize = WidestRegister / WidestType; 4894 4895 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4896 << " / " << WidestType << " bits.\n"); 4897 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4898 << WidestRegister << " bits.\n"); 4899 4900 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 4901 " into one vector!"); 4902 if (MaxVectorSize == 0) { 4903 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 4904 MaxVectorSize = 1; 4905 return MaxVectorSize; 4906 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 4907 isPowerOf2_32(ConstTripCount)) { 4908 // We need to clamp the VF to be the ConstTripCount. There is no point in 4909 // choosing a higher viable VF as done in the loop below. 4910 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 4911 << ConstTripCount << "\n"); 4912 MaxVectorSize = ConstTripCount; 4913 return MaxVectorSize; 4914 } 4915 4916 unsigned MaxVF = MaxVectorSize; 4917 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 4918 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 4919 // Collect all viable vectorization factors larger than the default MaxVF 4920 // (i.e. MaxVectorSize). 4921 SmallVector<unsigned, 8> VFs; 4922 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 4923 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 4924 VFs.push_back(VS); 4925 4926 // For each VF calculate its register usage. 4927 auto RUs = calculateRegisterUsage(VFs); 4928 4929 // Select the largest VF which doesn't require more registers than existing 4930 // ones. 
4931 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); 4932 for (int i = RUs.size() - 1; i >= 0; --i) { 4933 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { 4934 MaxVF = VFs[i]; 4935 break; 4936 } 4937 } 4938 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 4939 if (MaxVF < MinVF) { 4940 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 4941 << ") with target's minimum: " << MinVF << '\n'); 4942 MaxVF = MinVF; 4943 } 4944 } 4945 } 4946 return MaxVF; 4947 } 4948 4949 VectorizationFactor 4950 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 4951 float Cost = expectedCost(1).first; 4952 const float ScalarCost = Cost; 4953 unsigned Width = 1; 4954 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 4955 4956 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 4957 if (ForceVectorization && MaxVF > 1) { 4958 // Ignore scalar width, because the user explicitly wants vectorization. 4959 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 4960 // evaluation. 4961 Cost = std::numeric_limits<float>::max(); 4962 } 4963 4964 for (unsigned i = 2; i <= MaxVF; i *= 2) { 4965 // Notice that the vector loop needs to be executed less times, so 4966 // we need to divide the cost of the vector loops by the width of 4967 // the vector elements. 4968 VectorizationCostTy C = expectedCost(i); 4969 float VectorCost = C.first / (float)i; 4970 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 4971 << " costs: " << (int)VectorCost << ".\n"); 4972 if (!C.second && !ForceVectorization) { 4973 LLVM_DEBUG( 4974 dbgs() << "LV: Not considering vector loop of width " << i 4975 << " because it will not generate any vector instructions.\n"); 4976 continue; 4977 } 4978 if (VectorCost < Cost) { 4979 Cost = VectorCost; 4980 Width = i; 4981 } 4982 } 4983 4984 if (!EnableCondStoresVectorization && NumPredStores) { 4985 reportVectorizationFailure("There are conditional stores.", 4986 "store that is conditionally executed prevents vectorization", 4987 "ConditionalStore", ORE, TheLoop); 4988 Width = 1; 4989 Cost = ScalarCost; 4990 } 4991 4992 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 4993 << "LV: Vectorization seems to be not beneficial, " 4994 << "but was forced by a user.\n"); 4995 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 4996 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 4997 return Factor; 4998 } 4999 5000 std::pair<unsigned, unsigned> 5001 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5002 unsigned MinWidth = -1U; 5003 unsigned MaxWidth = 8; 5004 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5005 5006 // For each block. 5007 for (BasicBlock *BB : TheLoop->blocks()) { 5008 // For each instruction in the loop. 5009 for (Instruction &I : BB->instructionsWithoutDebug()) { 5010 Type *T = I.getType(); 5011 5012 // Skip ignored values. 5013 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5014 continue; 5015 5016 // Only examine Loads, Stores and PHINodes. 5017 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5018 continue; 5019 5020 // Examine PHI nodes that are reduction variables. Update the type to 5021 // account for the recurrence type. 
5022 if (auto *PN = dyn_cast<PHINode>(&I)) { 5023 if (!Legal->isReductionVariable(PN)) 5024 continue; 5025 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; 5026 T = RdxDesc.getRecurrenceType(); 5027 } 5028 5029 // Examine the stored values. 5030 if (auto *ST = dyn_cast<StoreInst>(&I)) 5031 T = ST->getValueOperand()->getType(); 5032 5033 // Ignore loaded pointer types and stored pointer types that are not 5034 // vectorizable. 5035 // 5036 // FIXME: The check here attempts to predict whether a load or store will 5037 // be vectorized. We only know this for certain after a VF has 5038 // been selected. Here, we assume that if an access can be 5039 // vectorized, it will be. We should also look at extending this 5040 // optimization to non-pointer types. 5041 // 5042 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5043 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5044 continue; 5045 5046 MinWidth = std::min(MinWidth, 5047 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5048 MaxWidth = std::max(MaxWidth, 5049 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5050 } 5051 } 5052 5053 return {MinWidth, MaxWidth}; 5054 } 5055 5056 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5057 unsigned LoopCost) { 5058 // -- The interleave heuristics -- 5059 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5060 // There are many micro-architectural considerations that we can't predict 5061 // at this level. For example, frontend pressure (on decode or fetch) due to 5062 // code size, or the number and capabilities of the execution ports. 5063 // 5064 // We use the following heuristics to select the interleave count: 5065 // 1. If the code has reductions, then we interleave to break the cross 5066 // iteration dependency. 5067 // 2. If the loop is really small, then we interleave to reduce the loop 5068 // overhead. 5069 // 3. We don't interleave if we think that we will spill registers to memory 5070 // due to the increased register pressure. 5071 5072 if (!isScalarEpilogueAllowed()) 5073 return 1; 5074 5075 // We used the distance for the interleave count. 5076 if (Legal->getMaxSafeDepDistBytes() != -1U) 5077 return 1; 5078 5079 // Do not interleave loops with a relatively small trip count. 5080 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5081 if (TC > 1 && TC < TinyTripCountInterleaveThreshold) 5082 return 1; 5083 5084 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); 5085 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5086 << " registers\n"); 5087 5088 if (VF == 1) { 5089 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5090 TargetNumRegisters = ForceTargetNumScalarRegs; 5091 } else { 5092 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5093 TargetNumRegisters = ForceTargetNumVectorRegs; 5094 } 5095 5096 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5097 // We divide by these constants so assume that we have at least one 5098 // instruction that uses at least one register. 5099 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); 5100 5101 // We calculate the interleave count using the following formula. 5102 // Subtract the number of loop invariants from the number of available 5103 // registers. These registers are used by all of the interleaved instances. 
5104 // Next, divide the remaining registers by the number of registers that is 5105 // required by the loop, in order to estimate how many parallel instances 5106 // fit without causing spills. All of this is rounded down if necessary to be 5107 // a power of two. We want power of two interleave count to simplify any 5108 // addressing operations or alignment considerations. 5109 // We also want power of two interleave counts to ensure that the induction 5110 // variable of the vector loop wraps to zero, when tail is folded by masking; 5111 // this currently happens when OptForSize, in which case IC is set to 1 above. 5112 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / 5113 R.MaxLocalUsers); 5114 5115 // Don't count the induction variable as interleaved. 5116 if (EnableIndVarRegisterHeur) 5117 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / 5118 std::max(1U, (R.MaxLocalUsers - 1))); 5119 5120 // Clamp the interleave ranges to reasonable counts. 5121 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5122 5123 // Check if the user has overridden the max. 5124 if (VF == 1) { 5125 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5126 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5127 } else { 5128 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5129 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5130 } 5131 5132 // If the trip count is constant, limit the interleave count to be less than 5133 // the trip count divided by VF. 5134 if (TC > 0) { 5135 assert(TC >= VF && "VF exceeds trip count?"); 5136 if ((TC / VF) < MaxInterleaveCount) 5137 MaxInterleaveCount = (TC / VF); 5138 } 5139 5140 // If we did not calculate the cost for VF (because the user selected the VF) 5141 // then we calculate the cost of VF here. 5142 if (LoopCost == 0) 5143 LoopCost = expectedCost(VF).first; 5144 5145 assert(LoopCost && "Non-zero loop cost expected"); 5146 5147 // Clamp the calculated IC to be between the 1 and the max interleave count 5148 // that the target and trip count allows. 5149 if (IC > MaxInterleaveCount) 5150 IC = MaxInterleaveCount; 5151 else if (IC < 1) 5152 IC = 1; 5153 5154 // Interleave if we vectorized this loop and there is a reduction that could 5155 // benefit from interleaving. 5156 if (VF > 1 && !Legal->getReductionVars()->empty()) { 5157 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5158 return IC; 5159 } 5160 5161 // Note that if we've already vectorized the loop we will have done the 5162 // runtime check and so interleaving won't require further checks. 5163 bool InterleavingRequiresRuntimePointerCheck = 5164 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5165 5166 // We want to interleave small loops in order to reduce the loop overhead and 5167 // potentially expose ILP opportunities. 5168 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5169 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5170 // We assume that the cost overhead is 1 and we use the cost model 5171 // to estimate the cost of the loop and interleave until the cost of the 5172 // loop overhead is about 5% of the cost of the loop. 5173 unsigned SmallIC = 5174 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5175 5176 // Interleave until store/load ports (estimated by max interleave count) are 5177 // saturated. 
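// For example (illustrative numbers): with IC = 8, two stores and one load
// in the loop, StoresIC = 8/2 = 4 and LoadsIC = 8/1 = 8; if the larger of
// the two exceeds SmallIC (and the runtime load/store interleave heuristic
// is enabled), that larger value is returned below.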
5178 unsigned NumStores = Legal->getNumStores();
5179 unsigned NumLoads = Legal->getNumLoads();
5180 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5181 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5182
5183 // If we have a scalar reduction (vector reductions are already dealt with
5184 // by this point), we can increase the critical path length if the loop
5185 // we're interleaving is inside another loop. Limit this, by default, to 2 so
5186 // the critical path only gets increased by one reduction operation.
5187 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5188 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5189 SmallIC = std::min(SmallIC, F);
5190 StoresIC = std::min(StoresIC, F);
5191 LoadsIC = std::min(LoadsIC, F);
5192 }
5193
5194 if (EnableLoadStoreRuntimeInterleave &&
5195 std::max(StoresIC, LoadsIC) > SmallIC) {
5196 LLVM_DEBUG(
5197 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5198 return std::max(StoresIC, LoadsIC);
5199 }
5200
5201 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5202 return SmallIC;
5203 }
5204
5205 // Interleave if this is a large loop (small loops are already dealt with by
5206 // this point) that could benefit from interleaving.
5207 bool HasReductions = !Legal->getReductionVars()->empty();
5208 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5209 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5210 return IC;
5211 }
5212
5213 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5214 return 1;
5215 }
5216
5217 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5218 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5219 // This function calculates the register usage by measuring the highest number
5220 // of values that are alive at a single location. Obviously, this is a very
5221 // rough estimation. We scan the loop in topological order and
5222 // assign a number to each instruction. We use RPO to ensure that defs are
5223 // met before their users. We assume that each instruction that has in-loop
5224 // users starts an interval. We record every time that an in-loop value is
5225 // used, so we have a list of the first and last occurrences of each
5226 // instruction. Next, we transpose this data structure into a multi-map that
5227 // holds the list of intervals that *end* at a specific location. This multi-map
5228 // allows us to perform a linear search. We scan the instructions linearly
5229 // and record each time that a new interval starts, by placing it in a set.
5230 // If we find this value in the multi-map then we remove it from the set.
5231 // The max register usage is the maximum size of the set.
5232 // We also search for instructions that are defined outside the loop, but are
5233 // used inside the loop. We need this number separately from the max-interval
5234 // usage number because when we unroll, loop-invariant values do not take
5235 // more registers.
5236 LoopBlocksDFS DFS(TheLoop);
5237 DFS.perform(LI);
5238
5239 RegisterUsage RU;
5240
5241 // Each 'key' in the map opens a new interval. The values
5242 // of the map are the index of the 'last seen' usage of the
5243 // instruction that is the key.
5244 using IntervalMap = DenseMap<Instruction *, unsigned>;
5245
5246 // Maps instruction to its index.
5247 SmallVector<Instruction *, 64> IdxToInstr;
5248 // Marks the end of each interval.
5249 IntervalMap EndPoint; 5250 // Saves the list of instruction indices that are used in the loop. 5251 SmallPtrSet<Instruction *, 8> Ends; 5252 // Saves the list of values that are used in the loop but are 5253 // defined outside the loop, such as arguments and constants. 5254 SmallPtrSet<Value *, 8> LoopInvariants; 5255 5256 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5257 for (Instruction &I : BB->instructionsWithoutDebug()) { 5258 IdxToInstr.push_back(&I); 5259 5260 // Save the end location of each USE. 5261 for (Value *U : I.operands()) { 5262 auto *Instr = dyn_cast<Instruction>(U); 5263 5264 // Ignore non-instruction values such as arguments, constants, etc. 5265 if (!Instr) 5266 continue; 5267 5268 // If this instruction is outside the loop then record it and continue. 5269 if (!TheLoop->contains(Instr)) { 5270 LoopInvariants.insert(Instr); 5271 continue; 5272 } 5273 5274 // Overwrite previous end points. 5275 EndPoint[Instr] = IdxToInstr.size(); 5276 Ends.insert(Instr); 5277 } 5278 } 5279 } 5280 5281 // Saves the list of intervals that end with the index in 'key'. 5282 using InstrList = SmallVector<Instruction *, 2>; 5283 DenseMap<unsigned, InstrList> TransposeEnds; 5284 5285 // Transpose the EndPoints to a list of values that end at each index. 5286 for (auto &Interval : EndPoint) 5287 TransposeEnds[Interval.second].push_back(Interval.first); 5288 5289 SmallPtrSet<Instruction *, 8> OpenIntervals; 5290 5291 // Get the size of the widest register. 5292 unsigned MaxSafeDepDist = -1U; 5293 if (Legal->getMaxSafeDepDistBytes() != -1U) 5294 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5295 unsigned WidestRegister = 5296 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5297 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5298 5299 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5300 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); 5301 5302 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5303 5304 // A lambda that gets the register usage for the given type and VF. 5305 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5306 if (Ty->isTokenTy()) 5307 return 0U; 5308 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5309 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5310 }; 5311 5312 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5313 Instruction *I = IdxToInstr[i]; 5314 5315 // Remove all of the instructions that end at this location. 5316 InstrList &List = TransposeEnds[i]; 5317 for (Instruction *ToRemove : List) 5318 OpenIntervals.erase(ToRemove); 5319 5320 // Ignore instructions that are never used within the loop. 5321 if (Ends.find(I) == Ends.end()) 5322 continue; 5323 5324 // Skip ignored values. 5325 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5326 continue; 5327 5328 // For each VF find the maximum usage of registers. 5329 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5330 if (VFs[j] == 1) { 5331 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); 5332 continue; 5333 } 5334 collectUniformsAndScalars(VFs[j]); 5335 // Count the number of live intervals. 5336 unsigned RegUsage = 0; 5337 for (auto Inst : OpenIntervals) { 5338 // Skip ignored values for VF > 1. 
5339 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5340 isScalarAfterVectorization(Inst, VFs[j]))
5341 continue;
5342 RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5343 }
5344 MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5345 }
5346
5347 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5348 << OpenIntervals.size() << '\n');
5349
5350 // Add the current instruction to the list of open intervals.
5351 OpenIntervals.insert(I);
5352 }
5353
5354 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5355 unsigned Invariant = 0;
5356 if (VFs[i] == 1)
5357 Invariant = LoopInvariants.size();
5358 else {
5359 for (auto Inst : LoopInvariants)
5360 Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5361 }
5362
5363 LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5364 LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5365 LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5366 << '\n');
5367
5368 RU.LoopInvariantRegs = Invariant;
5369 RU.MaxLocalUsers = MaxUsages[i];
5370 RUs[i] = RU;
5371 }
5372
5373 return RUs;
5374 }
5375
5376 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5377 // TODO: Cost model for emulated masked load/store is completely
5378 // broken. This hack guides the cost model to use an artificially
5379 // high enough value to practically disable vectorization with such
5380 // operations, except where the previously deployed legality hack allowed
5381 // using very low cost values. This is to avoid regressions coming simply
5382 // from moving the "masked load/store" check from legality to the cost model.
5383 // Masked Load/Gather emulation was previously never allowed.
5384 // Limited number of Masked Store/Scatter emulation was allowed.
5385 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5386 return isa<LoadInst>(I) ||
5387 (isa<StoreInst>(I) &&
5388 NumPredStores > NumberOfStoresToPredicate);
5389 }
5390
5391 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5392 // If we aren't vectorizing the loop, or if we've already collected the
5393 // instructions to scalarize, there's nothing to do. Collection may already
5394 // have occurred if we have a user-selected VF and are now computing the
5395 // expected cost for interleaving.
5396 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5397 return;
5398
5399 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5400 // not profitable to scalarize any instructions, the presence of VF in the
5401 // map will indicate that we've analyzed it already.
5402 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5403
5404 // Find all the instructions that are scalar with predication in the loop and
5405 // determine if it would be better to not if-convert the blocks they are in.
5406 // If so, we also record the instructions to scalarize.
5407 for (BasicBlock *BB : TheLoop->blocks()) {
5408 if (!blockNeedsPredication(BB))
5409 continue;
5410 for (Instruction &I : *BB)
5411 if (isScalarWithPredication(&I)) {
5412 ScalarCostsTy ScalarCosts;
5413 // Do not apply the discount logic if the hacked cost is needed
5414 // for emulated masked memrefs.
5415 if (!useEmulatedMaskMemRefHack(&I) &&
5416 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5417 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5418 // Remember that BB will remain after vectorization.
5419 PredicatedBBsAfterVectorization.insert(BB); 5420 } 5421 } 5422 } 5423 5424 int LoopVectorizationCostModel::computePredInstDiscount( 5425 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5426 unsigned VF) { 5427 assert(!isUniformAfterVectorization(PredInst, VF) && 5428 "Instruction marked uniform-after-vectorization will be predicated"); 5429 5430 // Initialize the discount to zero, meaning that the scalar version and the 5431 // vector version cost the same. 5432 int Discount = 0; 5433 5434 // Holds instructions to analyze. The instructions we visit are mapped in 5435 // ScalarCosts. Those instructions are the ones that would be scalarized if 5436 // we find that the scalar version costs less. 5437 SmallVector<Instruction *, 8> Worklist; 5438 5439 // Returns true if the given instruction can be scalarized. 5440 auto canBeScalarized = [&](Instruction *I) -> bool { 5441 // We only attempt to scalarize instructions forming a single-use chain 5442 // from the original predicated block that would otherwise be vectorized. 5443 // Although not strictly necessary, we give up on instructions we know will 5444 // already be scalar to avoid traversing chains that are unlikely to be 5445 // beneficial. 5446 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5447 isScalarAfterVectorization(I, VF)) 5448 return false; 5449 5450 // If the instruction is scalar with predication, it will be analyzed 5451 // separately. We ignore it within the context of PredInst. 5452 if (isScalarWithPredication(I)) 5453 return false; 5454 5455 // If any of the instruction's operands are uniform after vectorization, 5456 // the instruction cannot be scalarized. This prevents, for example, a 5457 // masked load from being scalarized. 5458 // 5459 // We assume we will only emit a value for lane zero of an instruction 5460 // marked uniform after vectorization, rather than VF identical values. 5461 // Thus, if we scalarize an instruction that uses a uniform, we would 5462 // create uses of values corresponding to the lanes we aren't emitting code 5463 // for. This behavior can be changed by allowing getScalarValue to clone 5464 // the lane zero values for uniforms rather than asserting. 5465 for (Use &U : I->operands()) 5466 if (auto *J = dyn_cast<Instruction>(U.get())) 5467 if (isUniformAfterVectorization(J, VF)) 5468 return false; 5469 5470 // Otherwise, we can scalarize the instruction. 5471 return true; 5472 }; 5473 5474 // Compute the expected cost discount from scalarizing the entire expression 5475 // feeding the predicated instruction. We currently only consider expressions 5476 // that are single-use instruction chains. 5477 Worklist.push_back(PredInst); 5478 while (!Worklist.empty()) { 5479 Instruction *I = Worklist.pop_back_val(); 5480 5481 // If we've already analyzed the instruction, there's nothing to do. 5482 if (ScalarCosts.find(I) != ScalarCosts.end()) 5483 continue; 5484 5485 // Compute the cost of the vector instruction. Note that this cost already 5486 // includes the scalarization overhead of the predicated instruction. 5487 unsigned VectorCost = getInstructionCost(I, VF).first; 5488 5489 // Compute the cost of the scalarized instruction. This cost is the cost of 5490 // the instruction as if it wasn't if-converted and instead remained in the 5491 // predicated block. We will scale this cost by block probability after 5492 // computing the scalarization overhead. 
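// For example (illustrative numbers, assuming VF = 4 and a reciprocal block
// probability of 2): a scalar instruction cost of 1 gives ScalarCost = 4,
// which is later halved to 2 to account for the block executing only some of
// the time; a vector cost below that value makes this instruction's
// contribution to the discount negative.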
5493 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5494 5495 // Compute the scalarization overhead of needed insertelement instructions 5496 // and phi nodes. 5497 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5498 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5499 true, false); 5500 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5501 } 5502 5503 // Compute the scalarization overhead of needed extractelement 5504 // instructions. For each of the instruction's operands, if the operand can 5505 // be scalarized, add it to the worklist; otherwise, account for the 5506 // overhead. 5507 for (Use &U : I->operands()) 5508 if (auto *J = dyn_cast<Instruction>(U.get())) { 5509 assert(VectorType::isValidElementType(J->getType()) && 5510 "Instruction has non-scalar type"); 5511 if (canBeScalarized(J)) 5512 Worklist.push_back(J); 5513 else if (needsExtract(J, VF)) 5514 ScalarCost += TTI.getScalarizationOverhead( 5515 ToVectorTy(J->getType(),VF), false, true); 5516 } 5517 5518 // Scale the total scalar cost by block probability. 5519 ScalarCost /= getReciprocalPredBlockProb(); 5520 5521 // Compute the discount. A non-negative discount means the vector version 5522 // of the instruction costs more, and scalarizing would be beneficial. 5523 Discount += VectorCost - ScalarCost; 5524 ScalarCosts[I] = ScalarCost; 5525 } 5526 5527 return Discount; 5528 } 5529 5530 LoopVectorizationCostModel::VectorizationCostTy 5531 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5532 VectorizationCostTy Cost; 5533 5534 // For each block. 5535 for (BasicBlock *BB : TheLoop->blocks()) { 5536 VectorizationCostTy BlockCost; 5537 5538 // For each instruction in the old loop. 5539 for (Instruction &I : BB->instructionsWithoutDebug()) { 5540 // Skip ignored values. 5541 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5542 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5543 continue; 5544 5545 VectorizationCostTy C = getInstructionCost(&I, VF); 5546 5547 // Check if we should override the cost. 5548 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5549 C.first = ForceTargetInstructionCost; 5550 5551 BlockCost.first += C.first; 5552 BlockCost.second |= C.second; 5553 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5554 << " for VF " << VF << " For instruction: " << I 5555 << '\n'); 5556 } 5557 5558 // If we are vectorizing a predicated block, it will have been 5559 // if-converted. This means that the block's instructions (aside from 5560 // stores and instructions that may divide by zero) will now be 5561 // unconditionally executed. For the scalar case, we may not always execute 5562 // the predicated block. Thus, scale the block's cost by the probability of 5563 // executing it. 5564 if (VF == 1 && blockNeedsPredication(BB)) 5565 BlockCost.first /= getReciprocalPredBlockProb(); 5566 5567 Cost.first += BlockCost.first; 5568 Cost.second |= BlockCost.second; 5569 } 5570 5571 return Cost; 5572 } 5573 5574 /// Gets Address Access SCEV after verifying that the access pattern 5575 /// is loop invariant except the induction variable dependence. 5576 /// 5577 /// This SCEV can be sent to the Target in order to estimate the address 5578 /// calculation cost. 
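///
/// For illustration (hypothetical IR, not taken from a test), a GEP such as
///   %p = getelementptr [256 x double], [256 x double]* %A, i64 %inv, i64 %iv
/// with %inv loop-invariant and %iv an induction variable is accepted, while a
/// GEP indexed by a value loaded inside the loop is rejected and nullptr is
/// returned.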
5579 static const SCEV *getAddressAccessSCEV( 5580 Value *Ptr, 5581 LoopVectorizationLegality *Legal, 5582 PredicatedScalarEvolution &PSE, 5583 const Loop *TheLoop) { 5584 5585 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5586 if (!Gep) 5587 return nullptr; 5588 5589 // We are looking for a gep with all loop invariant indices except for one 5590 // which should be an induction variable. 5591 auto SE = PSE.getSE(); 5592 unsigned NumOperands = Gep->getNumOperands(); 5593 for (unsigned i = 1; i < NumOperands; ++i) { 5594 Value *Opd = Gep->getOperand(i); 5595 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5596 !Legal->isInductionVariable(Opd)) 5597 return nullptr; 5598 } 5599 5600 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5601 return PSE.getSCEV(Ptr); 5602 } 5603 5604 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5605 return Legal->hasStride(I->getOperand(0)) || 5606 Legal->hasStride(I->getOperand(1)); 5607 } 5608 5609 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5610 unsigned VF) { 5611 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5612 Type *ValTy = getMemInstValueType(I); 5613 auto SE = PSE.getSE(); 5614 5615 unsigned Alignment = getLoadStoreAlignment(I); 5616 unsigned AS = getLoadStoreAddressSpace(I); 5617 Value *Ptr = getLoadStorePointerOperand(I); 5618 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5619 5620 // Figure out whether the access is strided and get the stride value 5621 // if it's known in compile time 5622 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5623 5624 // Get the cost of the scalar memory instruction and address computation. 5625 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5626 5627 // Don't pass *I here, since it is scalar but will actually be part of a 5628 // vectorized loop where the user of it is a vectorized instruction. 5629 Cost += VF * 5630 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 5631 AS); 5632 5633 // Get the overhead of the extractelement and insertelement instructions 5634 // we might create due to scalarization. 5635 Cost += getScalarizationOverhead(I, VF); 5636 5637 // If we have a predicated store, it may not be executed for each vector 5638 // lane. Scale the cost by the probability of executing the predicated 5639 // block. 5640 if (isPredicatedInst(I)) { 5641 Cost /= getReciprocalPredBlockProb(); 5642 5643 if (useEmulatedMaskMemRefHack(I)) 5644 // Artificially setting to a high enough value to practically disable 5645 // vectorization with such operations. 
5646 Cost = 3000000; 5647 } 5648 5649 return Cost; 5650 } 5651 5652 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5653 unsigned VF) { 5654 Type *ValTy = getMemInstValueType(I); 5655 Type *VectorTy = ToVectorTy(ValTy, VF); 5656 unsigned Alignment = getLoadStoreAlignment(I); 5657 Value *Ptr = getLoadStorePointerOperand(I); 5658 unsigned AS = getLoadStoreAddressSpace(I); 5659 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5660 5661 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5662 "Stride should be 1 or -1 for consecutive memory access"); 5663 unsigned Cost = 0; 5664 if (Legal->isMaskRequired(I)) 5665 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 5666 else 5667 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5668 5669 bool Reverse = ConsecutiveStride < 0; 5670 if (Reverse) 5671 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5672 return Cost; 5673 } 5674 5675 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5676 unsigned VF) { 5677 Type *ValTy = getMemInstValueType(I); 5678 Type *VectorTy = ToVectorTy(ValTy, VF); 5679 unsigned Alignment = getLoadStoreAlignment(I); 5680 unsigned AS = getLoadStoreAddressSpace(I); 5681 if (isa<LoadInst>(I)) { 5682 return TTI.getAddressComputationCost(ValTy) + 5683 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5684 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5685 } 5686 StoreInst *SI = cast<StoreInst>(I); 5687 5688 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5689 return TTI.getAddressComputationCost(ValTy) + 5690 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5691 (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( 5692 Instruction::ExtractElement, 5693 VectorTy, VF - 1)); 5694 } 5695 5696 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5697 unsigned VF) { 5698 Type *ValTy = getMemInstValueType(I); 5699 Type *VectorTy = ToVectorTy(ValTy, VF); 5700 unsigned Alignment = getLoadStoreAlignment(I); 5701 Value *Ptr = getLoadStorePointerOperand(I); 5702 5703 return TTI.getAddressComputationCost(VectorTy) + 5704 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5705 Legal->isMaskRequired(I), Alignment); 5706 } 5707 5708 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5709 unsigned VF) { 5710 Type *ValTy = getMemInstValueType(I); 5711 Type *VectorTy = ToVectorTy(ValTy, VF); 5712 unsigned AS = getLoadStoreAddressSpace(I); 5713 5714 auto Group = getInterleavedAccessGroup(I); 5715 assert(Group && "Fail to get an interleaved access group."); 5716 5717 unsigned InterleaveFactor = Group->getFactor(); 5718 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5719 5720 // Holds the indices of existing members in an interleaved load group. 5721 // An interleaved store group doesn't need this as it doesn't allow gaps. 5722 SmallVector<unsigned, 4> Indices; 5723 if (isa<LoadInst>(I)) { 5724 for (unsigned i = 0; i < InterleaveFactor; i++) 5725 if (Group->getMember(i)) 5726 Indices.push_back(i); 5727 } 5728 5729 // Calculate the cost of the whole interleaved group. 
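  // For example, a load group with factor 2 over i32 members at VF = 4 is
  // costed as one wide <8 x i32> access (WideVecTy below) plus the target's
  // estimate for de-interleaving the present Indices and, if needed, masking
  // the gaps. (Illustrative types; the number comes from
  // TTI.getInterleavedMemoryOpCost.)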
5730 bool UseMaskForGaps = 5731 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5732 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5733 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5734 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5735 5736 if (Group->isReverse()) { 5737 // TODO: Add support for reversed masked interleaved access. 5738 assert(!Legal->isMaskRequired(I) && 5739 "Reverse masked interleaved access not supported."); 5740 Cost += Group->getNumMembers() * 5741 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5742 } 5743 return Cost; 5744 } 5745 5746 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5747 unsigned VF) { 5748 // Calculate scalar cost only. Vectorization cost should be ready at this 5749 // moment. 5750 if (VF == 1) { 5751 Type *ValTy = getMemInstValueType(I); 5752 unsigned Alignment = getLoadStoreAlignment(I); 5753 unsigned AS = getLoadStoreAddressSpace(I); 5754 5755 return TTI.getAddressComputationCost(ValTy) + 5756 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5757 } 5758 return getWideningCost(I, VF); 5759 } 5760 5761 LoopVectorizationCostModel::VectorizationCostTy 5762 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5763 // If we know that this instruction will remain uniform, check the cost of 5764 // the scalar version. 5765 if (isUniformAfterVectorization(I, VF)) 5766 VF = 1; 5767 5768 if (VF > 1 && isProfitableToScalarize(I, VF)) 5769 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5770 5771 // Forced scalars do not have any scalarization overhead. 5772 auto ForcedScalar = ForcedScalars.find(VF); 5773 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5774 auto InstSet = ForcedScalar->second; 5775 if (InstSet.find(I) != InstSet.end()) 5776 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5777 } 5778 5779 Type *VectorTy; 5780 unsigned C = getInstructionCost(I, VF, VectorTy); 5781 5782 bool TypeNotScalarized = 5783 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5784 return VectorizationCostTy(C, TypeNotScalarized); 5785 } 5786 5787 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5788 unsigned VF) { 5789 5790 if (VF == 1) 5791 return 0; 5792 5793 unsigned Cost = 0; 5794 Type *RetTy = ToVectorTy(I->getType(), VF); 5795 if (!RetTy->isVoidTy() && 5796 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5797 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5798 5799 // Some targets keep addresses scalar. 5800 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5801 return Cost; 5802 5803 // Some targets support efficient element stores. 5804 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5805 return Cost; 5806 5807 // Collect operands to consider. 5808 CallInst *CI = dyn_cast<CallInst>(I); 5809 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 5810 5811 // Skip operands that do not require extraction/scalarization and do not incur 5812 // any overhead. 5813 return Cost + TTI.getOperandsScalarizationOverhead( 5814 filterExtractingOperands(Ops, VF), VF); 5815 } 5816 5817 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 5818 if (VF == 1) 5819 return; 5820 NumPredStores = 0; 5821 for (BasicBlock *BB : TheLoop->blocks()) { 5822 // For each instruction in the old loop. 
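    // The decision made below for each memory instruction is, in order:
    // uniform addresses are scalarized, consecutive accesses are widened
    // (possibly reversed), and otherwise the cheapest of interleaving,
    // gather/scatter and scalarization wins.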
5823 for (Instruction &I : *BB) { 5824 Value *Ptr = getLoadStorePointerOperand(&I); 5825 if (!Ptr) 5826 continue; 5827 5828 // TODO: We should generate better code and update the cost model for 5829 // predicated uniform stores. Today they are treated as any other 5830 // predicated store (see added test cases in 5831 // invariant-store-vectorization.ll). 5832 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 5833 NumPredStores++; 5834 5835 if (Legal->isUniform(Ptr) && 5836 // Conditional loads and stores should be scalarized and predicated. 5837 // isScalarWithPredication cannot be used here since masked 5838 // gather/scatters are not considered scalar with predication. 5839 !Legal->blockNeedsPredication(I.getParent())) { 5840 // TODO: Avoid replicating loads and stores instead of 5841 // relying on instcombine to remove them. 5842 // Load: Scalar load + broadcast 5843 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 5844 unsigned Cost = getUniformMemOpCost(&I, VF); 5845 setWideningDecision(&I, VF, CM_Scalarize, Cost); 5846 continue; 5847 } 5848 5849 // We assume that widening is the best solution when possible. 5850 if (memoryInstructionCanBeWidened(&I, VF)) { 5851 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 5852 int ConsecutiveStride = 5853 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 5854 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5855 "Expected consecutive stride."); 5856 InstWidening Decision = 5857 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 5858 setWideningDecision(&I, VF, Decision, Cost); 5859 continue; 5860 } 5861 5862 // Choose between Interleaving, Gather/Scatter or Scalarization. 5863 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 5864 unsigned NumAccesses = 1; 5865 if (isAccessInterleaved(&I)) { 5866 auto Group = getInterleavedAccessGroup(&I); 5867 assert(Group && "Fail to get an interleaved access group."); 5868 5869 // Make one decision for the whole group. 5870 if (getWideningDecision(&I, VF) != CM_Unknown) 5871 continue; 5872 5873 NumAccesses = Group->getNumMembers(); 5874 if (interleavedAccessCanBeWidened(&I, VF)) 5875 InterleaveCost = getInterleaveGroupCost(&I, VF); 5876 } 5877 5878 unsigned GatherScatterCost = 5879 isLegalGatherOrScatter(&I) 5880 ? getGatherScatterCost(&I, VF) * NumAccesses 5881 : std::numeric_limits<unsigned>::max(); 5882 5883 unsigned ScalarizationCost = 5884 getMemInstScalarizationCost(&I, VF) * NumAccesses; 5885 5886 // Choose better solution for the current VF, 5887 // write down this decision and use it during vectorization. 5888 unsigned Cost; 5889 InstWidening Decision; 5890 if (InterleaveCost <= GatherScatterCost && 5891 InterleaveCost < ScalarizationCost) { 5892 Decision = CM_Interleave; 5893 Cost = InterleaveCost; 5894 } else if (GatherScatterCost < ScalarizationCost) { 5895 Decision = CM_GatherScatter; 5896 Cost = GatherScatterCost; 5897 } else { 5898 Decision = CM_Scalarize; 5899 Cost = ScalarizationCost; 5900 } 5901 // If the instructions belongs to an interleave group, the whole group 5902 // receives the same decision. The whole group receives the cost, but 5903 // the cost will actually be assigned to one instruction. 5904 if (auto Group = getInterleavedAccessGroup(&I)) 5905 setWideningDecision(Group, VF, Decision, Cost); 5906 else 5907 setWideningDecision(&I, VF, Decision, Cost); 5908 } 5909 } 5910 5911 // Make sure that any load of address and any other address computation 5912 // remains scalar unless there is gather/scatter support. 
This avoids 5913 // inevitable extracts into address registers, and also has the benefit of 5914 // activating LSR more, since that pass can't optimize vectorized 5915 // addresses. 5916 if (TTI.prefersVectorizedAddressing()) 5917 return; 5918 5919 // Start with all scalar pointer uses. 5920 SmallPtrSet<Instruction *, 8> AddrDefs; 5921 for (BasicBlock *BB : TheLoop->blocks()) 5922 for (Instruction &I : *BB) { 5923 Instruction *PtrDef = 5924 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 5925 if (PtrDef && TheLoop->contains(PtrDef) && 5926 getWideningDecision(&I, VF) != CM_GatherScatter) 5927 AddrDefs.insert(PtrDef); 5928 } 5929 5930 // Add all instructions used to generate the addresses. 5931 SmallVector<Instruction *, 4> Worklist; 5932 for (auto *I : AddrDefs) 5933 Worklist.push_back(I); 5934 while (!Worklist.empty()) { 5935 Instruction *I = Worklist.pop_back_val(); 5936 for (auto &Op : I->operands()) 5937 if (auto *InstOp = dyn_cast<Instruction>(Op)) 5938 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 5939 AddrDefs.insert(InstOp).second) 5940 Worklist.push_back(InstOp); 5941 } 5942 5943 for (auto *I : AddrDefs) { 5944 if (isa<LoadInst>(I)) { 5945 // Setting the desired widening decision should ideally be handled in 5946 // by cost functions, but since this involves the task of finding out 5947 // if the loaded register is involved in an address computation, it is 5948 // instead changed here when we know this is the case. 5949 InstWidening Decision = getWideningDecision(I, VF); 5950 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 5951 // Scalarize a widened load of address. 5952 setWideningDecision(I, VF, CM_Scalarize, 5953 (VF * getMemoryInstructionCost(I, 1))); 5954 else if (auto Group = getInterleavedAccessGroup(I)) { 5955 // Scalarize an interleave group of address loads. 5956 for (unsigned I = 0; I < Group->getFactor(); ++I) { 5957 if (Instruction *Member = Group->getMember(I)) 5958 setWideningDecision(Member, VF, CM_Scalarize, 5959 (VF * getMemoryInstructionCost(Member, 1))); 5960 } 5961 } 5962 } else 5963 // Make sure I gets scalarized and a cost estimate without 5964 // scalarization overhead. 5965 ForcedScalars[VF].insert(I); 5966 } 5967 } 5968 5969 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 5970 unsigned VF, 5971 Type *&VectorTy) { 5972 Type *RetTy = I->getType(); 5973 if (canTruncateToMinimalBitwidth(I, VF)) 5974 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 5975 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 5976 auto SE = PSE.getSE(); 5977 5978 // TODO: We need to estimate the cost of intrinsic calls. 5979 switch (I->getOpcode()) { 5980 case Instruction::GetElementPtr: 5981 // We mark this instruction as zero-cost because the cost of GEPs in 5982 // vectorized code depends on whether the corresponding memory instruction 5983 // is scalarized or not. Therefore, we handle GEPs with the memory 5984 // instruction cost. 5985 return 0; 5986 case Instruction::Br: { 5987 // In cases of scalarized and predicated instructions, there will be VF 5988 // predicated blocks in the vectorized loop. Each branch around these 5989 // blocks requires also an extract of its vector compare i1 element. 
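    // For example, at VF = 4 such a branch is charged the cost of extracting
    // four i1 lanes from the <4 x i1> compare plus four scalar branch costs,
    // which is exactly what the ScalarPredicatedBB case below computes.
    // (Illustrative VF only.)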
5990 bool ScalarPredicatedBB = false; 5991 BranchInst *BI = cast<BranchInst>(I); 5992 if (VF > 1 && BI->isConditional() && 5993 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 5994 PredicatedBBsAfterVectorization.end() || 5995 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 5996 PredicatedBBsAfterVectorization.end())) 5997 ScalarPredicatedBB = true; 5998 5999 if (ScalarPredicatedBB) { 6000 // Return cost for branches around scalarized and predicated blocks. 6001 Type *Vec_i1Ty = 6002 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6003 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6004 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6005 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6006 // The back-edge branch will remain, as will all scalar branches. 6007 return TTI.getCFInstrCost(Instruction::Br); 6008 else 6009 // This branch will be eliminated by if-conversion. 6010 return 0; 6011 // Note: We currently assume zero cost for an unconditional branch inside 6012 // a predicated block since it will become a fall-through, although we 6013 // may decide in the future to call TTI for all branches. 6014 } 6015 case Instruction::PHI: { 6016 auto *Phi = cast<PHINode>(I); 6017 6018 // First-order recurrences are replaced by vector shuffles inside the loop. 6019 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6020 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6021 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6022 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6023 6024 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6025 // converted into select instructions. We require N - 1 selects per phi 6026 // node, where N is the number of incoming values. 6027 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6028 return (Phi->getNumIncomingValues() - 1) * 6029 TTI.getCmpSelInstrCost( 6030 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6031 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6032 6033 return TTI.getCFInstrCost(Instruction::PHI); 6034 } 6035 case Instruction::UDiv: 6036 case Instruction::SDiv: 6037 case Instruction::URem: 6038 case Instruction::SRem: 6039 // If we have a predicated instruction, it may not be executed for each 6040 // vector lane. Get the scalarization cost and scale this amount by the 6041 // probability of executing the predicated block. If the instruction is not 6042 // predicated, we fall through to the next case. 6043 if (VF > 1 && isScalarWithPredication(I)) { 6044 unsigned Cost = 0; 6045 6046 // These instructions have a non-void type, so account for the phi nodes 6047 // that we will create. This cost is likely to be zero. The phi node 6048 // cost, if any, should be scaled by the block probability because it 6049 // models a copy at the end of each predicated block. 6050 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6051 6052 // The cost of the non-predicated instruction. 6053 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6054 6055 // The cost of insertelement and extractelement instructions needed for 6056 // scalarization. 6057 Cost += getScalarizationOverhead(I, VF); 6058 6059 // Scale the cost by the probability of executing the predicated blocks. 6060 // This assumes the predicated block for each vector lane is equally 6061 // likely. 
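    // For example, with VF = 4, a phi cost of 0, a scalar sdiv cost of 20 and
    // a scalarization overhead of 6, Cost is 4 * 0 + 4 * 20 + 6 = 86, and the
    // value returned below is 86 / 2 = 43. (Illustrative costs; real values
    // come from TTI, and getReciprocalPredBlockProb() is currently 2.)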
6062 return Cost / getReciprocalPredBlockProb(); 6063 } 6064 LLVM_FALLTHROUGH; 6065 case Instruction::Add: 6066 case Instruction::FAdd: 6067 case Instruction::Sub: 6068 case Instruction::FSub: 6069 case Instruction::Mul: 6070 case Instruction::FMul: 6071 case Instruction::FDiv: 6072 case Instruction::FRem: 6073 case Instruction::Shl: 6074 case Instruction::LShr: 6075 case Instruction::AShr: 6076 case Instruction::And: 6077 case Instruction::Or: 6078 case Instruction::Xor: { 6079 // Since we will replace the stride by 1 the multiplication should go away. 6080 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6081 return 0; 6082 // Certain instructions can be cheaper to vectorize if they have a constant 6083 // second vector operand. One example of this are shifts on x86. 6084 Value *Op2 = I->getOperand(1); 6085 TargetTransformInfo::OperandValueProperties Op2VP; 6086 TargetTransformInfo::OperandValueKind Op2VK = 6087 TTI.getOperandInfo(Op2, Op2VP); 6088 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6089 Op2VK = TargetTransformInfo::OK_UniformValue; 6090 6091 SmallVector<const Value *, 4> Operands(I->operand_values()); 6092 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6093 return N * TTI.getArithmeticInstrCost( 6094 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6095 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); 6096 } 6097 case Instruction::FNeg: { 6098 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6099 return N * TTI.getArithmeticInstrCost( 6100 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6101 TargetTransformInfo::OK_AnyValue, 6102 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6103 I->getOperand(0)); 6104 } 6105 case Instruction::Select: { 6106 SelectInst *SI = cast<SelectInst>(I); 6107 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6108 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6109 Type *CondTy = SI->getCondition()->getType(); 6110 if (!ScalarCond) 6111 CondTy = VectorType::get(CondTy, VF); 6112 6113 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6114 } 6115 case Instruction::ICmp: 6116 case Instruction::FCmp: { 6117 Type *ValTy = I->getOperand(0)->getType(); 6118 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6119 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6120 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6121 VectorTy = ToVectorTy(ValTy, VF); 6122 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6123 } 6124 case Instruction::Store: 6125 case Instruction::Load: { 6126 unsigned Width = VF; 6127 if (Width > 1) { 6128 InstWidening Decision = getWideningDecision(I, Width); 6129 assert(Decision != CM_Unknown && 6130 "CM decision should be taken at this point"); 6131 if (Decision == CM_Scalarize) 6132 Width = 1; 6133 } 6134 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6135 return getMemoryInstructionCost(I, VF); 6136 } 6137 case Instruction::ZExt: 6138 case Instruction::SExt: 6139 case Instruction::FPToUI: 6140 case Instruction::FPToSI: 6141 case Instruction::FPExt: 6142 case Instruction::PtrToInt: 6143 case Instruction::IntToPtr: 6144 case Instruction::SIToFP: 6145 case Instruction::UIToFP: 6146 case Instruction::Trunc: 6147 case Instruction::FPTrunc: 6148 case Instruction::BitCast: { 6149 // We optimize the truncation of induction variables having constant 6150 // integer steps. 
The cost of these truncations is the same as the scalar 6151 // operation. 6152 if (isOptimizableIVTruncate(I, VF)) { 6153 auto *Trunc = cast<TruncInst>(I); 6154 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6155 Trunc->getSrcTy(), Trunc); 6156 } 6157 6158 Type *SrcScalarTy = I->getOperand(0)->getType(); 6159 Type *SrcVecTy = 6160 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6161 if (canTruncateToMinimalBitwidth(I, VF)) { 6162 // This cast is going to be shrunk. This may remove the cast or it might 6163 // turn it into slightly different cast. For example, if MinBW == 16, 6164 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6165 // 6166 // Calculate the modified src and dest types. 6167 Type *MinVecTy = VectorTy; 6168 if (I->getOpcode() == Instruction::Trunc) { 6169 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6170 VectorTy = 6171 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6172 } else if (I->getOpcode() == Instruction::ZExt || 6173 I->getOpcode() == Instruction::SExt) { 6174 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6175 VectorTy = 6176 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6177 } 6178 } 6179 6180 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6181 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6182 } 6183 case Instruction::Call: { 6184 bool NeedToScalarize; 6185 CallInst *CI = cast<CallInst>(I); 6186 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6187 if (getVectorIntrinsicIDForCall(CI, TLI)) 6188 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6189 return CallCost; 6190 } 6191 default: 6192 // The cost of executing VF copies of the scalar instruction. This opcode 6193 // is unknown. Assume that it is the same as 'mul'. 6194 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6195 getScalarizationOverhead(I, VF); 6196 } // end of switch. 6197 } 6198 6199 char LoopVectorize::ID = 0; 6200 6201 static const char lv_name[] = "Loop Vectorization"; 6202 6203 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6204 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6205 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6206 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6207 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6208 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6209 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6210 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6211 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6212 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6213 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6214 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6215 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6216 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6217 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6218 6219 namespace llvm { 6220 6221 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6222 6223 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6224 bool VectorizeOnlyWhenForced) { 6225 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6226 } 6227 6228 } // end namespace llvm 6229 6230 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6231 // Check if the pointer operand of a load or store instruction is 6232 // consecutive. 
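  // Legal->isConsecutivePtr returns a non-zero stride (1 forward, -1 reverse)
  // only for pointers that advance by exactly one element per iteration;
  // anything else, including non-memory instructions, makes this return false.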
6233 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6234 return Legal->isConsecutivePtr(Ptr); 6235 return false; 6236 } 6237 6238 void LoopVectorizationCostModel::collectValuesToIgnore() { 6239 // Ignore ephemeral values. 6240 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6241 6242 // Ignore type-promoting instructions we identified during reduction 6243 // detection. 6244 for (auto &Reduction : *Legal->getReductionVars()) { 6245 RecurrenceDescriptor &RedDes = Reduction.second; 6246 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6247 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6248 } 6249 // Ignore type-casting instructions we identified during induction 6250 // detection. 6251 for (auto &Induction : *Legal->getInductionVars()) { 6252 InductionDescriptor &IndDes = Induction.second; 6253 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6254 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6255 } 6256 } 6257 6258 // TODO: we could return a pair of values that specify the max VF and 6259 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6260 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6261 // doesn't have a cost model that can choose which plan to execute if 6262 // more than one is generated. 6263 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6264 LoopVectorizationCostModel &CM) { 6265 unsigned WidestType; 6266 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6267 return WidestVectorRegBits / WidestType; 6268 } 6269 6270 VectorizationFactor 6271 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6272 unsigned VF = UserVF; 6273 // Outer loop handling: They may require CFG and instruction level 6274 // transformations before even evaluating whether vectorization is profitable. 6275 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6276 // the vectorization pipeline. 6277 if (!OrigLoop->empty()) { 6278 // If the user doesn't provide a vectorization factor, determine a 6279 // reasonable one. 6280 if (!UserVF) { 6281 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6282 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6283 6284 // Make sure we have a VF > 1 for stress testing. 6285 if (VPlanBuildStressTest && VF < 2) { 6286 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6287 << "overriding computed VF.\n"); 6288 VF = 4; 6289 } 6290 } 6291 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6292 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6293 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6294 << " to build VPlans.\n"); 6295 buildVPlans(VF, VF); 6296 6297 // For VPlan build stress testing, we bail out after VPlan construction. 6298 if (VPlanBuildStressTest) 6299 return VectorizationFactor::Disabled(); 6300 6301 return {VF, 0}; 6302 } 6303 6304 LLVM_DEBUG( 6305 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6306 "VPlan-native path.\n"); 6307 return VectorizationFactor::Disabled(); 6308 } 6309 6310 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { 6311 assert(OrigLoop->empty() && "Inner loop expected."); 6312 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); 6313 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6314 return None; 6315 6316 // Invalidate interleave groups if all blocks of loop will be predicated. 
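  // When the tail is folded by masking, even the header is predicated, so the
  // groups would have to be emitted as masked wide loads/stores; drop the
  // groups rather than rely on masked interleaved accesses the target does
  // not support.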
6317 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6318 !useMaskedInterleavedAccesses(*TTI)) { 6319 LLVM_DEBUG( 6320 dbgs() 6321 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6322 "which requires masked-interleaved support.\n"); 6323 CM.InterleaveInfo.reset(); 6324 } 6325 6326 if (UserVF) { 6327 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6328 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6329 // Collect the instructions (and their associated costs) that will be more 6330 // profitable to scalarize. 6331 CM.selectUserVectorizationFactor(UserVF); 6332 buildVPlansWithVPRecipes(UserVF, UserVF); 6333 LLVM_DEBUG(printPlans(dbgs())); 6334 return {{UserVF, 0}}; 6335 } 6336 6337 unsigned MaxVF = MaybeMaxVF.getValue(); 6338 assert(MaxVF != 0 && "MaxVF is zero."); 6339 6340 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6341 // Collect Uniform and Scalar instructions after vectorization with VF. 6342 CM.collectUniformsAndScalars(VF); 6343 6344 // Collect the instructions (and their associated costs) that will be more 6345 // profitable to scalarize. 6346 if (VF > 1) 6347 CM.collectInstsToScalarize(VF); 6348 } 6349 6350 buildVPlansWithVPRecipes(1, MaxVF); 6351 LLVM_DEBUG(printPlans(dbgs())); 6352 if (MaxVF == 1) 6353 return VectorizationFactor::Disabled(); 6354 6355 // Select the optimal vectorization factor. 6356 return CM.selectVectorizationFactor(MaxVF); 6357 } 6358 6359 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6360 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6361 << '\n'); 6362 BestVF = VF; 6363 BestUF = UF; 6364 6365 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6366 return !Plan->hasVF(VF); 6367 }); 6368 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6369 } 6370 6371 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6372 DominatorTree *DT) { 6373 // Perform the actual loop transformation. 6374 6375 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6376 VPCallbackILV CallbackILV(ILV); 6377 6378 VPTransformState State{BestVF, BestUF, LI, 6379 DT, ILV.Builder, ILV.VectorLoopValueMap, 6380 &ILV, CallbackILV}; 6381 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6382 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6383 6384 //===------------------------------------------------===// 6385 // 6386 // Notice: any optimization or new instruction that go 6387 // into the code below should also be implemented in 6388 // the cost-model. 6389 // 6390 //===------------------------------------------------===// 6391 6392 // 2. Copy and widen instructions from the old loop into the new loop. 6393 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6394 VPlans.front()->execute(&State); 6395 6396 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6397 // predication, updating analyses. 6398 ILV.fixVectorizedLoop(); 6399 } 6400 6401 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6402 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6403 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6404 6405 // We create new control-flow for the vectorized loop, so the original 6406 // condition will be dead after vectorization if it's only used by the 6407 // branch. 
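  // For example (hypothetical IR), a latch ending in
  //   %exitcond = icmp eq i64 %iv.next, %n
  //   br i1 %exitcond, label %exit, label %loop
  // contributes %exitcond here, because only the branch uses it.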
6408 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6409 if (Cmp && Cmp->hasOneUse()) 6410 DeadInstructions.insert(Cmp); 6411 6412 // We create new "steps" for induction variable updates to which the original 6413 // induction variables map. An original update instruction will be dead if 6414 // all its users except the induction variable are dead. 6415 for (auto &Induction : *Legal->getInductionVars()) { 6416 PHINode *Ind = Induction.first; 6417 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6418 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6419 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6420 DeadInstructions.end(); 6421 })) 6422 DeadInstructions.insert(IndUpdate); 6423 6424 // We record as "Dead" also the type-casting instructions we had identified 6425 // during induction analysis. We don't need any handling for them in the 6426 // vectorized loop because we have proven that, under a proper runtime 6427 // test guarding the vectorized loop, the value of the phi, and the casted 6428 // value of the phi, are the same. The last instruction in this casting chain 6429 // will get its scalar/vector/widened def from the scalar/vector/widened def 6430 // of the respective phi node. Any other casts in the induction def-use chain 6431 // have no other uses outside the phi update chain, and will be ignored. 6432 InductionDescriptor &IndDes = Induction.second; 6433 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6434 DeadInstructions.insert(Casts.begin(), Casts.end()); 6435 } 6436 } 6437 6438 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6439 6440 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6441 6442 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6443 Instruction::BinaryOps BinOp) { 6444 // When unrolling and the VF is 1, we only need to add a simple scalar. 6445 Type *Ty = Val->getType(); 6446 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6447 6448 if (Ty->isFloatingPointTy()) { 6449 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6450 6451 // Floating point operations had to be 'fast' to enable the unrolling. 6452 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6453 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6454 } 6455 Constant *C = ConstantInt::get(Ty, StartIdx); 6456 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6457 } 6458 6459 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6460 SmallVector<Metadata *, 4> MDs; 6461 // Reserve first location for self reference to the LoopID metadata node. 6462 MDs.push_back(nullptr); 6463 bool IsUnrollMetadata = false; 6464 MDNode *LoopID = L->getLoopID(); 6465 if (LoopID) { 6466 // First find existing loop unrolling disable metadata. 6467 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6468 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6469 if (MD) { 6470 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6471 IsUnrollMetadata = 6472 S && S->getString().startswith("llvm.loop.unroll.disable"); 6473 } 6474 MDs.push_back(LoopID->getOperand(i)); 6475 } 6476 } 6477 6478 if (!IsUnrollMetadata) { 6479 // Add runtime unroll disable metadata. 
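    // The resulting loop metadata looks roughly like (sketch only):
    //   br ... !llvm.loop !0
    //   !0 = distinct !{!0, <existing operands>, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}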
6480 LLVMContext &Context = L->getHeader()->getContext(); 6481 SmallVector<Metadata *, 1> DisableOperands; 6482 DisableOperands.push_back( 6483 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6484 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6485 MDs.push_back(DisableNode); 6486 MDNode *NewLoopID = MDNode::get(Context, MDs); 6487 // Set operand 0 to refer to the loop id itself. 6488 NewLoopID->replaceOperandWith(0, NewLoopID); 6489 L->setLoopID(NewLoopID); 6490 } 6491 } 6492 6493 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6494 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6495 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6496 bool PredicateAtRangeStart = Predicate(Range.Start); 6497 6498 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6499 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6500 Range.End = TmpVF; 6501 break; 6502 } 6503 6504 return PredicateAtRangeStart; 6505 } 6506 6507 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6508 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6509 /// of VF's starting at a given VF and extending it as much as possible. Each 6510 /// vectorization decision can potentially shorten this sub-range during 6511 /// buildVPlan(). 6512 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6513 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6514 VFRange SubRange = {VF, MaxVF + 1}; 6515 VPlans.push_back(buildVPlan(SubRange)); 6516 VF = SubRange.End; 6517 } 6518 } 6519 6520 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6521 VPlanPtr &Plan) { 6522 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6523 6524 // Look for cached value. 6525 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6526 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6527 if (ECEntryIt != EdgeMaskCache.end()) 6528 return ECEntryIt->second; 6529 6530 VPValue *SrcMask = createBlockInMask(Src, Plan); 6531 6532 // The terminator has to be a branch inst! 6533 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6534 assert(BI && "Unexpected terminator found"); 6535 6536 if (!BI->isConditional()) 6537 return EdgeMaskCache[Edge] = SrcMask; 6538 6539 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6540 assert(EdgeMask && "No Edge Mask found for condition"); 6541 6542 if (BI->getSuccessor(0) != Dst) 6543 EdgeMask = Builder.createNot(EdgeMask); 6544 6545 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6546 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6547 6548 return EdgeMaskCache[Edge] = EdgeMask; 6549 } 6550 6551 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6552 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6553 6554 // Look for cached value. 6555 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6556 if (BCEntryIt != BlockMaskCache.end()) 6557 return BCEntryIt->second; 6558 6559 // All-one mask is modelled as no-mask following the convention for masked 6560 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6561 VPValue *BlockMask = nullptr; 6562 6563 if (OrigLoop->getHeader() == BB) { 6564 if (!CM.blockNeedsPredication(BB)) 6565 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6566 6567 // Introduce the early-exit compare IV <= BTC to form header block mask. 
6568 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6569 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6570 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6571 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6572 return BlockMaskCache[BB] = BlockMask; 6573 } 6574 6575 // This is the block mask. We OR all incoming edges. 6576 for (auto *Predecessor : predecessors(BB)) { 6577 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6578 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6579 return BlockMaskCache[BB] = EdgeMask; 6580 6581 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6582 BlockMask = EdgeMask; 6583 continue; 6584 } 6585 6586 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6587 } 6588 6589 return BlockMaskCache[BB] = BlockMask; 6590 } 6591 6592 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, 6593 VFRange &Range, 6594 VPlanPtr &Plan) { 6595 const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); 6596 if (!IG) 6597 return nullptr; 6598 6599 // Now check if IG is relevant for VF's in the given range. 6600 auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { 6601 return [=](unsigned VF) -> bool { 6602 return (VF >= 2 && // Query is illegal for VF == 1 6603 CM.getWideningDecision(I, VF) == 6604 LoopVectorizationCostModel::CM_Interleave); 6605 }; 6606 }; 6607 if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) 6608 return nullptr; 6609 6610 // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) 6611 // range. If it's the primary member of the IG construct a VPInterleaveRecipe. 6612 // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. 6613 assert(I == IG->getInsertPos() && 6614 "Generating a recipe for an adjunct member of an interleave group"); 6615 6616 VPValue *Mask = nullptr; 6617 if (Legal->isMaskRequired(I)) 6618 Mask = createBlockInMask(I->getParent(), Plan); 6619 6620 return new VPInterleaveRecipe(IG, Mask); 6621 } 6622 6623 VPWidenMemoryInstructionRecipe * 6624 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6625 VPlanPtr &Plan) { 6626 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6627 return nullptr; 6628 6629 auto willWiden = [&](unsigned VF) -> bool { 6630 if (VF == 1) 6631 return false; 6632 if (CM.isScalarAfterVectorization(I, VF) || 6633 CM.isProfitableToScalarize(I, VF)) 6634 return false; 6635 LoopVectorizationCostModel::InstWidening Decision = 6636 CM.getWideningDecision(I, VF); 6637 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6638 "CM decision should be taken at this point."); 6639 assert(Decision != LoopVectorizationCostModel::CM_Interleave && 6640 "Interleave memory opportunity should be caught earlier."); 6641 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6642 }; 6643 6644 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6645 return nullptr; 6646 6647 VPValue *Mask = nullptr; 6648 if (Legal->isMaskRequired(I)) 6649 Mask = createBlockInMask(I->getParent(), Plan); 6650 6651 return new VPWidenMemoryInstructionRecipe(*I, Mask); 6652 } 6653 6654 VPWidenIntOrFpInductionRecipe * 6655 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6656 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6657 // Check if this is an integer or fp induction. If so, build the recipe that 6658 // produces its scalar and vector values. 
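    // Pointer inductions (IK_PtrInduction) are not handled here; they fall
    // through to the generic VPWidenPHIRecipe path in tryToCreateRecipe.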
6659 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); 6660 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6661 II.getKind() == InductionDescriptor::IK_FpInduction) 6662 return new VPWidenIntOrFpInductionRecipe(Phi); 6663 6664 return nullptr; 6665 } 6666 6667 // Optimize the special case where the source is a constant integer 6668 // induction variable. Notice that we can only optimize the 'trunc' case 6669 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6670 // (c) other casts depend on pointer size. 6671 6672 // Determine whether \p K is a truncation based on an induction variable that 6673 // can be optimized. 6674 auto isOptimizableIVTruncate = 6675 [&](Instruction *K) -> std::function<bool(unsigned)> { 6676 return 6677 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6678 }; 6679 6680 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6681 isOptimizableIVTruncate(I), Range)) 6682 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6683 cast<TruncInst>(I)); 6684 return nullptr; 6685 } 6686 6687 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6688 PHINode *Phi = dyn_cast<PHINode>(I); 6689 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6690 return nullptr; 6691 6692 // We know that all PHIs in non-header blocks are converted into selects, so 6693 // we don't have to worry about the insertion order and we can just use the 6694 // builder. At this point we generate the predication tree. There may be 6695 // duplications since this is a simple recursive scan, but future 6696 // optimizations will clean it up. 6697 6698 SmallVector<VPValue *, 2> Masks; 6699 unsigned NumIncoming = Phi->getNumIncomingValues(); 6700 for (unsigned In = 0; In < NumIncoming; In++) { 6701 VPValue *EdgeMask = 6702 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6703 assert((EdgeMask || NumIncoming == 1) && 6704 "Multiple predecessors with one having a full mask"); 6705 if (EdgeMask) 6706 Masks.push_back(EdgeMask); 6707 } 6708 return new VPBlendRecipe(Phi, Masks); 6709 } 6710 6711 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6712 VFRange &Range) { 6713 6714 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6715 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6716 6717 if (IsPredicated) 6718 return false; 6719 6720 auto IsVectorizableOpcode = [](unsigned Opcode) { 6721 switch (Opcode) { 6722 case Instruction::Add: 6723 case Instruction::And: 6724 case Instruction::AShr: 6725 case Instruction::BitCast: 6726 case Instruction::Br: 6727 case Instruction::Call: 6728 case Instruction::FAdd: 6729 case Instruction::FCmp: 6730 case Instruction::FDiv: 6731 case Instruction::FMul: 6732 case Instruction::FNeg: 6733 case Instruction::FPExt: 6734 case Instruction::FPToSI: 6735 case Instruction::FPToUI: 6736 case Instruction::FPTrunc: 6737 case Instruction::FRem: 6738 case Instruction::FSub: 6739 case Instruction::GetElementPtr: 6740 case Instruction::ICmp: 6741 case Instruction::IntToPtr: 6742 case Instruction::Load: 6743 case Instruction::LShr: 6744 case Instruction::Mul: 6745 case Instruction::Or: 6746 case Instruction::PHI: 6747 case Instruction::PtrToInt: 6748 case Instruction::SDiv: 6749 case Instruction::Select: 6750 case Instruction::SExt: 6751 case Instruction::Shl: 6752 case Instruction::SIToFP: 6753 case Instruction::SRem: 6754 case Instruction::Store: 6755 case 
Instruction::Sub: 6756 case Instruction::Trunc: 6757 case Instruction::UDiv: 6758 case Instruction::UIToFP: 6759 case Instruction::URem: 6760 case Instruction::Xor: 6761 case Instruction::ZExt: 6762 return true; 6763 } 6764 return false; 6765 }; 6766 6767 if (!IsVectorizableOpcode(I->getOpcode())) 6768 return false; 6769 6770 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6771 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6772 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6773 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6774 return false; 6775 } 6776 6777 auto willWiden = [&](unsigned VF) -> bool { 6778 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6779 CM.isProfitableToScalarize(I, VF))) 6780 return false; 6781 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6782 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6783 // The following case may be scalarized depending on the VF. 6784 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6785 // version of the instruction. 6786 // Is it beneficial to perform intrinsic call compared to lib call? 6787 bool NeedToScalarize; 6788 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6789 bool UseVectorIntrinsic = 6790 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6791 return UseVectorIntrinsic || !NeedToScalarize; 6792 } 6793 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6794 assert(CM.getWideningDecision(I, VF) == 6795 LoopVectorizationCostModel::CM_Scalarize && 6796 "Memory widening decisions should have been taken care by now"); 6797 return false; 6798 } 6799 return true; 6800 }; 6801 6802 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6803 return false; 6804 6805 // Success: widen this instruction. We optimize the common case where 6806 // consecutive instructions can be represented by a single recipe. 6807 if (!VPBB->empty()) { 6808 VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); 6809 if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) 6810 return true; 6811 } 6812 6813 VPBB->appendRecipe(new VPWidenRecipe(I)); 6814 return true; 6815 } 6816 6817 VPBasicBlock *VPRecipeBuilder::handleReplication( 6818 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 6819 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 6820 VPlanPtr &Plan) { 6821 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 6822 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 6823 Range); 6824 6825 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6826 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6827 6828 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 6829 6830 // Find if I uses a predicated instruction. If so, it will use its scalar 6831 // value. Avoid hoisting the insert-element which packs the scalar value into 6832 // a vector value, as that happens iff all users use the vector value. 6833 for (auto &Op : I->operands()) 6834 if (auto *PredInst = dyn_cast<Instruction>(Op)) 6835 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 6836 PredInst2Recipe[PredInst]->setAlsoPack(false); 6837 6838 // Finalize the recipe for Instr, first if it is not predicated. 
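  // In the predicated case the recipe is instead wrapped below in a
  // pred.<opcode> region (entry -> if -> continue blocks, see
  // createReplicateRegion), and replication continues in a fresh VPBasicBlock
  // returned to the caller.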
6839 if (!IsPredicated) { 6840 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 6841 VPBB->appendRecipe(Recipe); 6842 return VPBB; 6843 } 6844 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 6845 assert(VPBB->getSuccessors().empty() && 6846 "VPBB has successors when handling predicated replication."); 6847 // Record predicated instructions for above packing optimizations. 6848 PredInst2Recipe[I] = Recipe; 6849 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 6850 VPBlockUtils::insertBlockAfter(Region, VPBB); 6851 auto *RegSucc = new VPBasicBlock(); 6852 VPBlockUtils::insertBlockAfter(RegSucc, Region); 6853 return RegSucc; 6854 } 6855 6856 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 6857 VPRecipeBase *PredRecipe, 6858 VPlanPtr &Plan) { 6859 // Instructions marked for predication are replicated and placed under an 6860 // if-then construct to prevent side-effects. 6861 6862 // Generate recipes to compute the block mask for this region. 6863 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 6864 6865 // Build the triangular if-then region. 6866 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 6867 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 6868 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 6869 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 6870 auto *PHIRecipe = 6871 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 6872 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 6873 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 6874 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 6875 6876 // Note: first set Entry as region entry and then connect successors starting 6877 // from it in order, to propagate the "parent" of each VPBasicBlock. 6878 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 6879 VPBlockUtils::connectBlocks(Pred, Exit); 6880 6881 return Region; 6882 } 6883 6884 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 6885 VPlanPtr &Plan, VPBasicBlock *VPBB) { 6886 VPRecipeBase *Recipe = nullptr; 6887 // Check if Instr should belong to an interleave memory recipe, or already 6888 // does. In the latter case Instr is irrelevant. 6889 if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { 6890 VPBB->appendRecipe(Recipe); 6891 return true; 6892 } 6893 6894 // Check if Instr is a memory operation that should be widened. 6895 if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { 6896 VPBB->appendRecipe(Recipe); 6897 return true; 6898 } 6899 6900 // Check if Instr should form some PHI recipe. 6901 if ((Recipe = tryToOptimizeInduction(Instr, Range))) { 6902 VPBB->appendRecipe(Recipe); 6903 return true; 6904 } 6905 if ((Recipe = tryToBlend(Instr, Plan))) { 6906 VPBB->appendRecipe(Recipe); 6907 return true; 6908 } 6909 if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { 6910 VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); 6911 return true; 6912 } 6913 6914 // Check if Instr is to be widened by a general VPWidenRecipe, after 6915 // having first checked for specific widening recipes that deal with 6916 // Interleave Groups, Inductions and Phi nodes. 
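  // If this also fails, the caller (buildVPlanWithVPRecipes) falls back to
  // handleReplication, which scalarizes Instr, predicated if necessary.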
6917 if (tryToWiden(Instr, VPBB, Range)) 6918 return true; 6919 6920 return false; 6921 } 6922 6923 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 6924 unsigned MaxVF) { 6925 assert(OrigLoop->empty() && "Inner loop expected."); 6926 6927 // Collect conditions feeding internal conditional branches; they need to be 6928 // represented in VPlan for it to model masking. 6929 SmallPtrSet<Value *, 1> NeedDef; 6930 6931 auto *Latch = OrigLoop->getLoopLatch(); 6932 for (BasicBlock *BB : OrigLoop->blocks()) { 6933 if (BB == Latch) 6934 continue; 6935 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 6936 if (Branch && Branch->isConditional()) 6937 NeedDef.insert(Branch->getCondition()); 6938 } 6939 6940 // If the tail is to be folded by masking, the primary induction variable 6941 // needs to be represented in VPlan for it to model early-exit masking. 6942 if (CM.foldTailByMasking()) 6943 NeedDef.insert(Legal->getPrimaryInduction()); 6944 6945 // Collect instructions from the original loop that will become trivially dead 6946 // in the vectorized loop. We don't need to vectorize these instructions. For 6947 // example, original induction update instructions can become dead because we 6948 // separately emit induction "steps" when generating code for the new loop. 6949 // Similarly, we create a new latch condition when setting up the structure 6950 // of the new loop, so the old one can become dead. 6951 SmallPtrSet<Instruction *, 4> DeadInstructions; 6952 collectTriviallyDeadInstructions(DeadInstructions); 6953 6954 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6955 VFRange SubRange = {VF, MaxVF + 1}; 6956 VPlans.push_back( 6957 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); 6958 VF = SubRange.End; 6959 } 6960 } 6961 6962 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 6963 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 6964 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6965 // Hold a mapping from predicated instructions to their recipes, in order to 6966 // fix their AlsoPack behavior if a user is determined to replicate and use a 6967 // scalar instead of vector value. 6968 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 6969 6970 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 6971 DenseMap<Instruction *, Instruction *> SinkAfterInverse; 6972 6973 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 6974 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 6975 auto Plan = std::make_unique<VPlan>(VPBB); 6976 6977 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); 6978 // Represent values that will have defs inside VPlan. 6979 for (Value *V : NeedDef) 6980 Plan->addVPValue(V); 6981 6982 // Scan the body of the loop in a topological order to visit each basic block 6983 // after having visited its predecessor basic blocks. 6984 LoopBlocksDFS DFS(OrigLoop); 6985 DFS.perform(LI); 6986 6987 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6988 // Relevant instructions from basic block BB will be grouped into VPRecipe 6989 // ingredients and fill a new VPBasicBlock. 6990 unsigned VPBBsForBB = 0; 6991 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 6992 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 6993 VPBB = FirstVPBBForBB; 6994 Builder.setInsertPoint(VPBB); 6995 6996 std::vector<Instruction *> Ingredients; 6997 6998 // Organize the ingredients to vectorize from current basic block in the 6999 // right order. 
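    // Purely illustrative example (hypothetical source loop): in a
    // first-order recurrence such as
    //
    //   for (i = 1; i < n; ++i) { b[i] = prev + 1; prev = a[i]; }
    //
    // an instruction involved in the recurrence may be recorded in SinkAfter,
    // meaning it must be emitted only after the instruction it should follow;
    // the loop below consults SinkAfter/SinkAfterInverse to order such
    // ingredients accordingly.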
7000 for (Instruction &I : BB->instructionsWithoutDebug()) { 7001 Instruction *Instr = &I; 7002 7003 // First filter out irrelevant instructions, to ensure no recipes are 7004 // built for them. 7005 if (isa<BranchInst>(Instr) || 7006 DeadInstructions.find(Instr) != DeadInstructions.end()) 7007 continue; 7008 7009 // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct 7010 // member of the IG, do not construct any Recipe for it. 7011 const InterleaveGroup<Instruction> *IG = 7012 CM.getInterleavedAccessGroup(Instr); 7013 if (IG && Instr != IG->getInsertPos() && 7014 Range.Start >= 2 && // Query is illegal for VF == 1 7015 CM.getWideningDecision(Instr, Range.Start) == 7016 LoopVectorizationCostModel::CM_Interleave) { 7017 auto SinkCandidate = SinkAfterInverse.find(Instr); 7018 if (SinkCandidate != SinkAfterInverse.end()) 7019 Ingredients.push_back(SinkCandidate->second); 7020 continue; 7021 } 7022 7023 // Move instructions to handle first-order recurrences, step 1: avoid 7024 // handling this instruction until after we've handled the instruction it 7025 // should follow. 7026 auto SAIt = SinkAfter.find(Instr); 7027 if (SAIt != SinkAfter.end()) { 7028 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" 7029 << *SAIt->second 7030 << " to vectorize a 1st order recurrence.\n"); 7031 SinkAfterInverse[SAIt->second] = Instr; 7032 continue; 7033 } 7034 7035 Ingredients.push_back(Instr); 7036 7037 // Move instructions to handle first-order recurrences, step 2: push the 7038 // instruction to be sunk at its insertion point. 7039 auto SAInvIt = SinkAfterInverse.find(Instr); 7040 if (SAInvIt != SinkAfterInverse.end()) 7041 Ingredients.push_back(SAInvIt->second); 7042 } 7043 7044 // Introduce each ingredient into VPlan. 7045 for (Instruction *Instr : Ingredients) { 7046 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7047 continue; 7048 7049 // Otherwise, if all widening options failed, Instruction is to be 7050 // replicated. This may create a successor for VPBB. 7051 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7052 Instr, Range, VPBB, PredInst2Recipe, Plan); 7053 if (NextVPBB != VPBB) { 7054 VPBB = NextVPBB; 7055 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7056 : ""); 7057 } 7058 } 7059 } 7060 7061 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7062 // may also be empty, such as the last one VPBB, reflecting original 7063 // basic-blocks with no recipes. 7064 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7065 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7066 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7067 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7068 delete PreEntry; 7069 7070 std::string PlanName; 7071 raw_string_ostream RSO(PlanName); 7072 unsigned VF = Range.Start; 7073 Plan->addVF(VF); 7074 RSO << "Initial VPlan for VF={" << VF; 7075 for (VF *= 2; VF < Range.End; VF *= 2) { 7076 Plan->addVF(VF); 7077 RSO << "," << VF; 7078 } 7079 RSO << "},UF>=1"; 7080 RSO.flush(); 7081 Plan->setName(PlanName); 7082 7083 return Plan; 7084 } 7085 7086 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7087 // Outer loop handling: They may require CFG and instruction level 7088 // transformations before even evaluating whether vectorization is profitable. 7089 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7090 // the vectorization pipeline. 
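  // Rough sequence below: build the hierarchical CFG for the loop nest,
  // record the candidate VFs from Range, and either stop after (optional)
  // VPlan predication or lower the VPInstructions into executable recipes.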
7091 assert(!OrigLoop->empty()); 7092 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7093 7094 // Create new empty VPlan 7095 auto Plan = std::make_unique<VPlan>(); 7096 7097 // Build hierarchical CFG 7098 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7099 HCFGBuilder.buildHierarchicalCFG(); 7100 7101 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7102 Plan->addVF(VF); 7103 7104 if (EnableVPlanPredication) { 7105 VPlanPredicator VPP(*Plan); 7106 VPP.predicate(); 7107 7108 // Avoid running transformation to recipes until masked code generation in 7109 // VPlan-native path is in place. 7110 return Plan; 7111 } 7112 7113 SmallPtrSet<Instruction *, 1> DeadInstructions; 7114 VPlanHCFGTransforms::VPInstructionsToVPRecipes( 7115 Plan, Legal->getInductionVars(), DeadInstructions); 7116 7117 return Plan; 7118 } 7119 7120 Value* LoopVectorizationPlanner::VPCallbackILV:: 7121 getOrCreateVectorValues(Value *V, unsigned Part) { 7122 return ILV.getOrCreateVectorValue(V, Part); 7123 } 7124 7125 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { 7126 O << " +\n" 7127 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7128 IG->getInsertPos()->printAsOperand(O, false); 7129 if (User) { 7130 O << ", "; 7131 User->getOperand(0)->printAsOperand(O); 7132 } 7133 O << "\\l\""; 7134 for (unsigned i = 0; i < IG->getFactor(); ++i) 7135 if (Instruction *I = IG->getMember(i)) 7136 O << " +\n" 7137 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7138 } 7139 7140 void VPWidenRecipe::execute(VPTransformState &State) { 7141 for (auto &Instr : make_range(Begin, End)) 7142 State.ILV->widenInstruction(Instr); 7143 } 7144 7145 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7146 assert(!State.Instance && "Int or FP induction being replicated."); 7147 State.ILV->widenIntOrFpInduction(IV, Trunc); 7148 } 7149 7150 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7151 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7152 } 7153 7154 void VPBlendRecipe::execute(VPTransformState &State) { 7155 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7156 // We know that all PHIs in non-header blocks are converted into 7157 // selects, so we don't have to worry about the insertion order and we 7158 // can just use the builder. 7159 // At this point we generate the predication tree. There may be 7160 // duplications since this is a simple recursive scan, but future 7161 // optimizations will clean it up. 7162 7163 unsigned NumIncoming = Phi->getNumIncomingValues(); 7164 7165 assert((User || NumIncoming == 1) && 7166 "Multiple predecessors with predecessors having a full mask"); 7167 // Generate a sequence of selects of the form: 7168 // SELECT(Mask3, In3, 7169 // SELECT(Mask2, In2, 7170 // ( ...))) 7171 InnerLoopVectorizer::VectorParts Entry(State.UF); 7172 for (unsigned In = 0; In < NumIncoming; ++In) { 7173 for (unsigned Part = 0; Part < State.UF; ++Part) { 7174 // We might have single edge PHIs (blocks) - use an identity 7175 // 'select' for the first PHI operand. 7176 Value *In0 = 7177 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7178 if (In == 0) 7179 Entry[Part] = In0; // Initialize with the first incoming value. 7180 else { 7181 // Select between the current value and the previous incoming edge 7182 // based on the incoming mask. 
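        // Schematically (illustrative value names), with three incoming
        // values each part becomes:
        //   %predphi  = select %mask2, %in2, %in1
        //   %predphi1 = select %mask3, %in3, %predphi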
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  if (!User)
    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
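  // Schematically (illustrative IR, assuming a vector mask), the code emitted
  // here for one lane is:
  //   %c = extractelement <VF x i1> %mask, i32 Lane
  //   br i1 %c, label %pred.<opcode>.if, label %pred.<opcode>.continue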
7250 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7251 assert(isa<UnreachableInst>(CurrentTerminator) && 7252 "Expected to replace unreachable terminator with conditional branch."); 7253 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7254 CondBr->setSuccessor(0, nullptr); 7255 ReplaceInstWithInst(CurrentTerminator, CondBr); 7256 } 7257 7258 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7259 assert(State.Instance && "Predicated instruction PHI works per instance."); 7260 Instruction *ScalarPredInst = cast<Instruction>( 7261 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7262 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7263 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7264 assert(PredicatingBB && "Predicated block has no single predecessor."); 7265 7266 // By current pack/unpack logic we need to generate only a single phi node: if 7267 // a vector value for the predicated instruction exists at this point it means 7268 // the instruction has vector users only, and a phi for the vector value is 7269 // needed. In this case the recipe of the predicated instruction is marked to 7270 // also do that packing, thereby "hoisting" the insert-element sequence. 7271 // Otherwise, a phi node for the scalar value is needed. 7272 unsigned Part = State.Instance->Part; 7273 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7274 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7275 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7276 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7277 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7278 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7279 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7280 } else { 7281 Type *PredInstType = PredInst->getType(); 7282 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7283 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7284 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7285 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7286 } 7287 } 7288 7289 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7290 if (!User) 7291 return State.ILV->vectorizeMemoryInstruction(&Instr); 7292 7293 // Last (and currently only) operand is a mask. 7294 InnerLoopVectorizer::VectorParts MaskValues(State.UF); 7295 VPValue *Mask = User->getOperand(User->getNumOperands() - 1); 7296 for (unsigned Part = 0; Part < State.UF; ++Part) 7297 MaskValues[Part] = State.get(Mask, Part); 7298 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); 7299 } 7300 7301 static ScalarEpilogueLowering 7302 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, 7303 ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { 7304 ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; 7305 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && 7306 (F->hasOptSize() || 7307 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) 7308 SEL = CM_ScalarEpilogueNotAllowedOptSize; 7309 else if (PreferPredicateOverEpilog || Hints.getPredicate()) 7310 SEL = CM_ScalarEpilogueNotNeededUsePredicate; 7311 7312 return SEL; 7313 } 7314 7315 // Process the loop in the VPlan-native vectorization path. 
This path builds 7316 // VPlan upfront in the vectorization pipeline, which allows to apply 7317 // VPlan-to-VPlan transformations from the very beginning without modifying the 7318 // input LLVM IR. 7319 static bool processLoopInVPlanNativePath( 7320 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7321 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7322 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7323 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7324 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7325 7326 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7327 Function *F = L->getHeader()->getParent(); 7328 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7329 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); 7330 7331 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7332 &Hints, IAI); 7333 // Use the planner for outer loop vectorization. 7334 // TODO: CM is not used at this point inside the planner. Turn CM into an 7335 // optional argument if we don't need it in the future. 7336 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); 7337 7338 // Get user vectorization factor. 7339 const unsigned UserVF = Hints.getWidth(); 7340 7341 // Plan how to best vectorize, return the best VF and its cost. 7342 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7343 7344 // If we are stress testing VPlan builds, do not attempt to generate vector 7345 // code. Masked vector code generation support will follow soon. 7346 // Also, do not attempt to vectorize if no vector code will be produced. 7347 if (VPlanBuildStressTest || EnableVPlanPredication || 7348 VectorizationFactor::Disabled() == VF) 7349 return false; 7350 7351 LVP.setBestPlan(VF.Width, 1); 7352 7353 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7354 &CM); 7355 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7356 << L->getHeader()->getParent()->getName() << "\"\n"); 7357 LVP.executePlan(LB, DT); 7358 7359 // Mark the loop as already vectorized to avoid vectorizing again. 7360 Hints.setAlreadyVectorized(); 7361 7362 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7363 return true; 7364 } 7365 7366 bool LoopVectorizePass::processLoop(Loop *L) { 7367 assert((EnableVPlanNativePath || L->empty()) && 7368 "VPlan-native path is not enabled. Only process inner loops."); 7369 7370 #ifndef NDEBUG 7371 const std::string DebugLocStr = getDebugLocString(L); 7372 #endif /* NDEBUG */ 7373 7374 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7375 << L->getHeader()->getParent()->getName() << "\" from " 7376 << DebugLocStr << "\n"); 7377 7378 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7379 7380 LLVM_DEBUG( 7381 dbgs() << "LV: Loop hints:" 7382 << " force=" 7383 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7384 ? "disabled" 7385 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7386 ? "enabled" 7387 : "?")) 7388 << " width=" << Hints.getWidth() 7389 << " unroll=" << Hints.getInterleave() << "\n"); 7390 7391 // Function containing loop 7392 Function *F = L->getHeader()->getParent(); 7393 7394 // Looking at the diagnostic output is the only way to determine if a loop 7395 // was vectorized (other than looking at the IR or machine code), so it 7396 // is important to generate an optimization remark for each loop. Most of 7397 // these messages are generated as OptimizationRemarkAnalysis. 
Remarks 7398 // generated as OptimizationRemark and OptimizationRemarkMissed are 7399 // less verbose reporting vectorized loops and unvectorized loops that may 7400 // benefit from vectorization, respectively. 7401 7402 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7403 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7404 return false; 7405 } 7406 7407 PredicatedScalarEvolution PSE(*SE, *L); 7408 7409 // Check if it is legal to vectorize the loop. 7410 LoopVectorizationRequirements Requirements(*ORE); 7411 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7412 &Requirements, &Hints, DB, AC); 7413 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7414 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7415 Hints.emitRemarkWithHints(); 7416 return false; 7417 } 7418 7419 // Check the function attributes and profiles to find out if this function 7420 // should be optimized for size. 7421 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); 7422 7423 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7424 // here. They may require CFG and instruction level transformations before 7425 // even evaluating whether vectorization is profitable. Since we cannot modify 7426 // the incoming IR, we need to build VPlan upfront in the vectorization 7427 // pipeline. 7428 if (!L->empty()) 7429 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7430 ORE, BFI, PSI, Hints); 7431 7432 assert(L->empty() && "Inner loop expected."); 7433 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7434 // count by optimizing for size, to minimize overheads. 7435 // Prefer constant trip counts over profile data, over upper bound estimate. 7436 unsigned ExpectedTC = 0; 7437 bool HasExpectedTC = false; 7438 if (const SCEVConstant *ConstExits = 7439 dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) { 7440 const APInt &ExitsCount = ConstExits->getAPInt(); 7441 // We are interested in small values for ExpectedTC. Skip over those that 7442 // can't fit an unsigned. 7443 if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) { 7444 ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1; 7445 HasExpectedTC = true; 7446 } 7447 } 7448 // ExpectedTC may be large because it's bound by a variable. Check 7449 // profiling information to validate we should vectorize. 7450 if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { 7451 auto EstimatedTC = getLoopEstimatedTripCount(L); 7452 if (EstimatedTC) { 7453 ExpectedTC = *EstimatedTC; 7454 HasExpectedTC = true; 7455 } 7456 } 7457 if (!HasExpectedTC) { 7458 ExpectedTC = SE->getSmallConstantMaxTripCount(L); 7459 HasExpectedTC = (ExpectedTC > 0); 7460 } 7461 7462 if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { 7463 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7464 << "This loop is worth vectorizing only if no scalar " 7465 << "iteration overheads are incurred."); 7466 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7467 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7468 else { 7469 LLVM_DEBUG(dbgs() << "\n"); 7470 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7471 } 7472 } 7473 7474 // Check the function attributes to see if implicit floats are allowed. 
7475 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7476 // an integer loop and the vector instructions selected are purely integer 7477 // vector instructions? 7478 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7479 reportVectorizationFailure( 7480 "Can't vectorize when the NoImplicitFloat attribute is used", 7481 "loop not vectorized due to NoImplicitFloat attribute", 7482 "NoImplicitFloat", ORE, L); 7483 Hints.emitRemarkWithHints(); 7484 return false; 7485 } 7486 7487 // Check if the target supports potentially unsafe FP vectorization. 7488 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7489 // for the target we're vectorizing for, to make sure none of the 7490 // additional fp-math flags can help. 7491 if (Hints.isPotentiallyUnsafe() && 7492 TTI->isFPVectorizationPotentiallyUnsafe()) { 7493 reportVectorizationFailure( 7494 "Potentially unsafe FP op prevents vectorization", 7495 "loop not vectorized due to unsafe FP support.", 7496 "UnsafeFP", ORE, L); 7497 Hints.emitRemarkWithHints(); 7498 return false; 7499 } 7500 7501 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7502 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7503 7504 // If an override option has been passed in for interleaved accesses, use it. 7505 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7506 UseInterleaved = EnableInterleavedMemAccesses; 7507 7508 // Analyze interleaved memory accesses. 7509 if (UseInterleaved) { 7510 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7511 } 7512 7513 // Use the cost model. 7514 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7515 F, &Hints, IAI); 7516 CM.collectValuesToIgnore(); 7517 7518 // Use the planner for vectorization. 7519 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); 7520 7521 // Get user vectorization factor. 7522 unsigned UserVF = Hints.getWidth(); 7523 7524 // Plan how to best vectorize, return the best VF and its cost. 7525 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7526 7527 VectorizationFactor VF = VectorizationFactor::Disabled(); 7528 unsigned IC = 1; 7529 unsigned UserIC = Hints.getInterleave(); 7530 7531 if (MaybeVF) { 7532 VF = *MaybeVF; 7533 // Select the interleave count. 7534 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7535 } 7536 7537 // Identify the diagnostic messages that should be produced. 7538 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 7539 bool VectorizeLoop = true, InterleaveLoop = true; 7540 if (Requirements.doesNotMeet(F, L, Hints)) { 7541 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 7542 "requirements.\n"); 7543 Hints.emitRemarkWithHints(); 7544 return false; 7545 } 7546 7547 if (VF.Width == 1) { 7548 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 7549 VecDiagMsg = std::make_pair( 7550 "VectorizationNotBeneficial", 7551 "the cost-model indicates that vectorization is not beneficial"); 7552 VectorizeLoop = false; 7553 } 7554 7555 if (!MaybeVF && UserIC > 1) { 7556 // Tell the user interleaving was avoided up-front, despite being explicitly 7557 // requested. 
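    // (This branch is taken when the planner returned no vectorization factor
    // at all, i.e. MaybeVF is empty, yet the user explicitly requested an
    // interleave count greater than one via the loop hints.)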
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not profitable to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is profitable to vectorize the loop, then do it.
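    // For reference, the remark emitted further below reads roughly like
    //   "vectorized loop (vectorization width: 4, interleaved count: 2)"
    // where the width and count are illustrative values only.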
7648 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7649 &LVL, &CM); 7650 LVP.executePlan(LB, DT); 7651 ++LoopsVectorized; 7652 7653 // Add metadata to disable runtime unrolling a scalar loop when there are 7654 // no runtime checks about strides and memory. A scalar loop that is 7655 // rarely used is not worth unrolling. 7656 if (!LB.areSafetyChecksAdded()) 7657 DisableRuntimeUnroll = true; 7658 7659 // Report the vectorization decision. 7660 ORE->emit([&]() { 7661 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7662 L->getHeader()) 7663 << "vectorized loop (vectorization width: " 7664 << NV("VectorizationFactor", VF.Width) 7665 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7666 }); 7667 } 7668 7669 Optional<MDNode *> RemainderLoopID = 7670 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7671 LLVMLoopVectorizeFollowupEpilogue}); 7672 if (RemainderLoopID.hasValue()) { 7673 L->setLoopID(RemainderLoopID.getValue()); 7674 } else { 7675 if (DisableRuntimeUnroll) 7676 AddRuntimeUnrollDisableMetaData(L); 7677 7678 // Mark the loop as already vectorized to avoid vectorizing again. 7679 Hints.setAlreadyVectorized(); 7680 } 7681 7682 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7683 return true; 7684 } 7685 7686 bool LoopVectorizePass::runImpl( 7687 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 7688 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 7689 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, 7690 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 7691 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 7692 SE = &SE_; 7693 LI = &LI_; 7694 TTI = &TTI_; 7695 DT = &DT_; 7696 BFI = &BFI_; 7697 TLI = TLI_; 7698 AA = &AA_; 7699 AC = &AC_; 7700 GetLAA = &GetLAA_; 7701 DB = &DB_; 7702 ORE = &ORE_; 7703 PSI = PSI_; 7704 7705 // Don't attempt if 7706 // 1. the target claims to have no vector registers, and 7707 // 2. interleaving won't help ILP. 7708 // 7709 // The second condition is necessary because, even if the target has no 7710 // vector registers, loop vectorization may still enable scalar 7711 // interleaving. 7712 if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) 7713 return false; 7714 7715 bool Changed = false; 7716 7717 // The vectorizer requires loops to be in simplified form. 7718 // Since simplification may add new inner loops, it has to run before the 7719 // legality and profitability checks. This means running the loop vectorizer 7720 // will simplify all loops, regardless of whether anything end up being 7721 // vectorized. 7722 for (auto &L : *LI) 7723 Changed |= 7724 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 7725 7726 // Build up a worklist of inner-loops to vectorize. This is necessary as 7727 // the act of vectorizing or partially unrolling a loop creates new loops 7728 // and can invalidate iterators across the loops. 7729 SmallVector<Loop *, 8> Worklist; 7730 7731 for (Loop *L : *LI) 7732 collectSupportedLoops(*L, LI, ORE, Worklist); 7733 7734 LoopsAnalyzed += Worklist.size(); 7735 7736 // Now walk the identified inner loops. 7737 while (!Worklist.empty()) { 7738 Loop *L = Worklist.pop_back_val(); 7739 7740 // For the inner loops we actually process, form LCSSA to simplify the 7741 // transform. 7742 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 7743 7744 Changed |= processLoop(L); 7745 } 7746 7747 // Process each loop nest in the function. 
7748 return Changed; 7749 } 7750 7751 PreservedAnalyses LoopVectorizePass::run(Function &F, 7752 FunctionAnalysisManager &AM) { 7753 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 7754 auto &LI = AM.getResult<LoopAnalysis>(F); 7755 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 7756 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 7757 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 7758 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 7759 auto &AA = AM.getResult<AAManager>(F); 7760 auto &AC = AM.getResult<AssumptionAnalysis>(F); 7761 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 7762 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 7763 MemorySSA *MSSA = EnableMSSALoopDependency 7764 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 7765 : nullptr; 7766 7767 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 7768 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 7769 [&](Loop &L) -> const LoopAccessInfo & { 7770 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 7771 return LAM.getResult<LoopAccessAnalysis>(L, AR); 7772 }; 7773 const ModuleAnalysisManager &MAM = 7774 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 7775 ProfileSummaryInfo *PSI = 7776 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 7777 bool Changed = 7778 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 7779 if (!Changed) 7780 return PreservedAnalyses::all(); 7781 PreservedAnalyses PA; 7782 7783 // We currently do not preserve loopinfo/dominator analyses with outer loop 7784 // vectorization. Until this is addressed, mark these analyses as preserved 7785 // only for non-VPlan-native path. 7786 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 7787 if (!EnableVPlanNativePath) { 7788 PA.preserve<LoopAnalysis>(); 7789 PA.preserve<DominatorTreeAnalysis>(); 7790 } 7791 PA.preserve<BasicAA>(); 7792 PA.preserve<GlobalsAA>(); 7793 return PA; 7794 } 7795