//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanHCFGTransforms.h"
#include "VPlanPredicator.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

// Name used for the pass and for -debug-only filtering of its output.
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

// Pass-wide statistics, reported with -stats.
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

/// When true, the cost model picks the vectorization factor based on the
/// smallest type in the loop, maximizing the register bandwidth used.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

/// Opt-in for vectorizing interleaved (strided, grouped) memory accesses.
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

// The "force-target-*" flags below override properties normally queried from
// TargetTransformInfo; they exist mainly to get consistent lit-test behavior
// across targets.

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

/// Loops whose estimated cost is at or below this value get extra
/// interleaving consideration by the interleaver.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

/// When interleaving, count the induction variable only once across all
/// interleaved copies when estimating register pressure.
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

// Note: deliberately non-static; referenced from other translation units.
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
274 static cl::opt<bool> VPlanBuildStressTest( 275 "vplan-build-stress-test", cl::init(false), cl::Hidden, 276 cl::desc( 277 "Build VPlan for every supported loop nest in the function and bail " 278 "out right after the build (stress test the VPlan H-CFG construction " 279 "in the VPlan-native vectorization path).")); 280 281 cl::opt<bool> llvm::EnableLoopInterleaving( 282 "interleave-loops", cl::init(true), cl::Hidden, 283 cl::desc("Enable loop interleaving in Loop vectorization passes")); 284 cl::opt<bool> llvm::EnableLoopVectorization( 285 "vectorize-loops", cl::init(true), cl::Hidden, 286 cl::desc("Run the Loop vectorization passes")); 287 288 /// A helper function for converting Scalar types to vector types. 289 /// If the incoming type is void, we return void. If the VF is 1, we return 290 /// the scalar type. 291 static Type *ToVectorTy(Type *Scalar, unsigned VF) { 292 if (Scalar->isVoidTy() || VF == 1) 293 return Scalar; 294 return VectorType::get(Scalar, VF); 295 } 296 297 /// A helper function that returns the type of loaded or stored value. 298 static Type *getMemInstValueType(Value *I) { 299 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && 300 "Expected Load or Store instruction"); 301 if (auto *LI = dyn_cast<LoadInst>(I)) 302 return LI->getType(); 303 return cast<StoreInst>(I)->getValueOperand()->getType(); 304 } 305 306 /// A helper function that returns true if the given type is irregular. The 307 /// type is irregular if its allocated size doesn't equal the store size of an 308 /// element of the corresponding vector type at the given vectorization factor. 309 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) { 310 // Determine if an array of VF elements of type Ty is "bitcast compatible" 311 // with a <VF x Ty> vector. 
312 if (VF > 1) { 313 auto *VectorTy = VectorType::get(Ty, VF); 314 return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy); 315 } 316 317 // If the vectorization factor is one, we just check if an array of type Ty 318 // requires padding between elements. 319 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); 320 } 321 322 /// A helper function that returns the reciprocal of the block probability of 323 /// predicated blocks. If we return X, we are assuming the predicated block 324 /// will execute once for every X iterations of the loop header. 325 /// 326 /// TODO: We should use actual block probability here, if available. Currently, 327 /// we always assume predicated blocks have a 50% chance of executing. 328 static unsigned getReciprocalPredBlockProb() { return 2; } 329 330 /// A helper function that adds a 'fast' flag to floating-point operations. 331 static Value *addFastMathFlag(Value *V) { 332 if (isa<FPMathOperator>(V)) 333 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast()); 334 return V; 335 } 336 337 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) { 338 if (isa<FPMathOperator>(V)) 339 cast<Instruction>(V)->setFastMathFlags(FMF); 340 return V; 341 } 342 343 /// A helper function that returns an integer or floating-point constant with 344 /// value C. 345 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { 346 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C) 347 : ConstantFP::get(Ty, C); 348 } 349 350 namespace llvm { 351 352 /// InnerLoopVectorizer vectorizes loops which contain only one basic 353 /// block to a specified vectorization factor (VF). 354 /// This class performs the widening of scalars into vectors, or multiple 355 /// scalars. This class also implements the following features: 356 /// * It inserts an epilogue loop for handling loops that don't have iteration 357 /// counts that are known to be a multiple of the vectorization factor. 
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  /// Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  /// NOTE(review): the \p UF and \p VF parameters shadow the members of the
  /// same names.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  /// NOTE(review): not initialized by the constructor above — confirm it is
  /// assigned elsewhere before any use.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

/// A specialization of InnerLoopVectorizer that "unrolls" a loop without
/// vectorizing it: the base class is constructed with a vectorization factor
/// of 1 and the given unroll factor.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
762 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) { 763 if (!I) 764 return I; 765 766 DebugLoc Empty; 767 if (I->getDebugLoc() != Empty) 768 return I; 769 770 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) { 771 if (Instruction *OpInst = dyn_cast<Instruction>(*OI)) 772 if (OpInst->getDebugLoc() != Empty) 773 return OpInst; 774 } 775 776 return I; 777 } 778 779 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) { 780 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) { 781 const DILocation *DIL = Inst->getDebugLoc(); 782 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && 783 !isa<DbgInfoIntrinsic>(Inst)) { 784 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF); 785 if (NewDIL) 786 B.SetCurrentDebugLocation(NewDIL.getValue()); 787 else 788 LLVM_DEBUG(dbgs() 789 << "Failed to create new discriminator: " 790 << DIL->getFilename() << " Line: " << DIL->getLine()); 791 } 792 else 793 B.SetCurrentDebugLocation(DIL); 794 } else 795 B.SetCurrentDebugLocation(DebugLoc()); 796 } 797 798 /// Write a record \p DebugMsg about vectorization failure to the debug 799 /// output stream. If \p I is passed, it is an instruction that prevents 800 /// vectorization. 801 #ifndef NDEBUG 802 static void debugVectorizationFailure(const StringRef DebugMsg, 803 Instruction *I) { 804 dbgs() << "LV: Not vectorizing: " << DebugMsg; 805 if (I != nullptr) 806 dbgs() << " " << *I; 807 else 808 dbgs() << '.'; 809 dbgs() << '\n'; 810 } 811 #endif 812 813 /// Create an analysis remark that explains why vectorization failed 814 /// 815 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 816 /// RemarkName is the identifier for the remark. If \p I is passed it is an 817 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for 818 /// the location of the remark. \return the remark object that can be 819 /// streamed to. 
820 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, 821 StringRef RemarkName, Loop *TheLoop, Instruction *I) { 822 Value *CodeRegion = TheLoop->getHeader(); 823 DebugLoc DL = TheLoop->getStartLoc(); 824 825 if (I) { 826 CodeRegion = I->getParent(); 827 // If there is no debug location attached to the instruction, revert back to 828 // using the loop's. 829 if (I->getDebugLoc()) 830 DL = I->getDebugLoc(); 831 } 832 833 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); 834 R << "loop not vectorized: "; 835 return R; 836 } 837 838 namespace llvm { 839 840 void reportVectorizationFailure(const StringRef DebugMsg, 841 const StringRef OREMsg, const StringRef ORETag, 842 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { 843 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); 844 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); 845 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), 846 ORETag, TheLoop, I) << OREMsg); 847 } 848 849 } // end namespace llvm 850 851 #ifndef NDEBUG 852 /// \return string containing a file name and a line # for the given loop. 853 static std::string getDebugLocString(const Loop *L) { 854 std::string Result; 855 if (L) { 856 raw_string_ostream OS(Result); 857 if (const DebugLoc LoopDbgLoc = L->getStartLoc()) 858 LoopDbgLoc.print(OS); 859 else 860 // Just print the module name. 861 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); 862 OS.flush(); 863 } 864 return Result; 865 } 866 #endif 867 868 void InnerLoopVectorizer::addNewMetadata(Instruction *To, 869 const Instruction *Orig) { 870 // If the loop was versioned with memchecks, add the corresponding no-alias 871 // metadata. 
872 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig))) 873 LVer->annotateInstWithNoAlias(To, Orig); 874 } 875 876 void InnerLoopVectorizer::addMetadata(Instruction *To, 877 Instruction *From) { 878 propagateMetadata(To, From); 879 addNewMetadata(To, From); 880 } 881 882 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, 883 Instruction *From) { 884 for (Value *V : To) { 885 if (Instruction *I = dyn_cast<Instruction>(V)) 886 addMetadata(I, From); 887 } 888 } 889 890 namespace llvm { 891 892 // Loop vectorization cost-model hints how the scalar epilogue loop should be 893 // lowered. 894 enum ScalarEpilogueLowering { 895 896 // The default: allowing scalar epilogues. 897 CM_ScalarEpilogueAllowed, 898 899 // Vectorization with OptForSize: don't allow epilogues. 900 CM_ScalarEpilogueNotAllowedOptSize, 901 902 // A special case of vectorisation with OptForSize: loops with a very small 903 // trip count are considered for vectorization under OptForSize, thereby 904 // making sure the cost of their loop body is dominant, free of runtime 905 // guards and scalar iteration overheads. 906 CM_ScalarEpilogueNotAllowedLowTripLoop, 907 908 // Loop hint predicate indicating an epilogue is undesired. 909 CM_ScalarEpilogueNotNeededPredicatePragma 910 }; 911 912 /// LoopVectorizationCostModel - estimates the expected speedups due to 913 /// vectorization. 914 /// In many cases vectorization is not profitable. This can happen because of 915 /// a number of reasons. In this class we mainly attempt to predict the 916 /// expected speedup/slowdowns due to the supported instruction set. We use the 917 /// TargetTransformInfo to query the different backends for the cost of 918 /// different operations. 
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;

    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    // With VF 1 every instruction is trivially "uniform".
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    // With VF 1 every instruction trivially remains scalar.
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        // Only the insert position of the group carries the real cost; the
        // other members get cost 0 so the group cost is not counted twice.
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// ruled out by optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if \p BB will execute under a mask/predicate in the
  /// vectorized loop: either because the tail is folded by masking, or
  /// because legality analysis determined the block needs predication.
  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  /// Number of predicated store instructions seen by the cost model.
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              unsigned VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(unsigned VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(unsigned VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
                                std::pair<InstWidening, unsigned>>;

  DecisionList WideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, unsigned VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  };

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   unsigned VF) {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The function being analyzed (ctor parameter \p F).
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};

} // end namespace llvm

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit.
// Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->empty() && "This is not an outer loop");
  // Interleaving is explicitly disabled for outer loops (see TODO below).
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}

/// Recursively walk the loop nest rooted at \p L and append to \p V the loops
/// the vectorizer should consider: innermost loops always, and (when the
/// VPlan-native path is enabled) explicitly-annotated outer loops, provided
/// they contain no irreducible control flow.
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.empty() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

namespace {

/// The LoopVectorize Pass.
/// Legacy pass-manager wrapper: gathers the analyses and delegates all of the
/// actual work to LoopVectorizePass::runImpl.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID) {
    Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
    Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    // TargetLibraryInfo is optional here (getAnalysisIfAvailable), so TLI may
    // legitimately be null; runImpl takes it as a pointer for that reason.
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // LoopAccessInfo is computed lazily, per loop, through this callback.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

/// Broadcast scalar \p V into a vector of VF lanes. The broadcast is hoisted
/// into the vector-loop preheader when \p V is loop-invariant and (if it is an
/// instruction) its block dominates the preheader; otherwise it is emitted at
/// the current insert point inside the vector loop body.
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  // The guard restores the previous insert point on scope exit.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}

/// Create a widened (vector) phi for the induction described by \p II, with
/// step \p Step, mapped to \p EntryVal (the original phi or a truncate of it).
/// The initial stepped value is built in the vector-loop preheader; one
/// "step.add" update per unroll part is emitted in the loop body.
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");
  Value *Start = II.getStartValue();

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    // Narrow both the start value and the step to the truncated type.
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

/// Returns true if the cost model decided \p I should remain scalar: either it
/// must be scalar after vectorization or scalarizing it is more profitable.
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

/// Returns true if induction \p IV needs a scalar copy: either the phi itself
/// is scalarized, or at least one of its in-loop users is.
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

/// If the induction \p ID has associated cast instructions, record
/// \p VectorLoopVal as the vector (or, when \p Lane != UINT_MAX, scalar) value
/// of the first cast for unroll part \p Part.
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that casted Phi is equal to the
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
  // re-uses the same InductionDescriptor that original IV uses but we don't
  // have to do any recording in this case - that is done when original IV is
  // processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if exist) have no uses outside the
  // induction update chain itself.
  Instruction *CastInst = *Casts.begin();
  // Lane == UINT_MAX (the default) means "record a vector value"; any smaller
  // lane index records a per-lane scalar value instead.
  if (Lane < UINT_MAX)
    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
  else
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}

/// Widen the integer or floating-point induction \p IV (optionally through the
/// truncate \p Trunc), producing a vector IV, a scalar IV, or both, depending
/// on the cost-model decisions for the phi and its users.
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars()->find(IV);
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The scalar value to broadcast. This will be derived from the canonical
  // induction variable.
  Value *ScalarIV = nullptr;

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // True if we have vectorized the induction variable.
  auto VectorizedIV = false;

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
         "Induction step should be loop invariant");
  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
  Value *Step = nullptr;
  if (PSE.getSE()->isSCEVable(IV->getType())) {
    // Materialize the SCEV step as IR in the preheader.
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                             LoopVectorPreHeader->getTerminator());
  } else {
    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    VectorizedIV = true;
  }

  // If we haven't yet vectorized the induction variable, or if we will create
  // a scalar one, we need to define the scalar induction variable and step
  // values. If we were given a truncation type, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  if (!VectorizedIV || NeedsScalarIV) {
    ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
  }

  // If we haven't yet vectorized the induction variable, splat the scalar
  // induction variable, and build the necessary step vectors.
  // TODO: Don't do it unless the vectorized IV is really required.
  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart =
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  }

  // If an induction variable is only used for counting loop iterations or
  // calculating addresses, it doesn't need to be widened. Create scalar steps
  // that can be used by instructions we will later scalarize. Note that the
  // addition of the scalar steps will not increase the number of instructions
  // in the loop in the common case prior to InstCombine. We will be trading
  // one vector extract for each scalar step.
  if (NeedsScalarIV)
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}

/// Return Val + (StartIdx, StartIdx+1, ..., StartIdx+VLen-1) * Step, using
/// integer arithmetic or the FP opcode \p BinOp as appropriate for the
/// element type of \p Val.
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  int VLen = Val->getType()->getVectorNumElements();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from zero to VF.
    for (int i = 0; i < VLen; ++i)
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));

    // Add the consecutive indices to the vector value.
    Constant *Cv = ConstantVector::get(Indices);
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(Cv, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from zero to VF.
  // NOTE(review): indices go through a double conversion here, which presumes
  // StartIdx + i is exactly representable — confirm for very large indices.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);

  Step = Builder.CreateVectorSplat(VLen, Step);

  // Floating point operations had to be 'fast' to enable the induction.
  FastMathFlags Flags;
  Flags.setFast();

  Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))
    // Have to check, MulOp may be a constant
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
  return BOp;
}

/// Compute per-lane scalar induction values ScalarIV + (VF * Part + Lane) *
/// Step for each unroll part, and record them in VectorLoopValueMap keyed by
/// \p EntryVal.
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF > 1 && "VF should be greater than one");

  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  unsigned Lanes =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
                                                                         : VF;
  // Compute the scalar steps and save the results in VectorLoopValueMap.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
    }
  }
}

/// Return the vector value of \p V for unroll part \p Part, creating it on
/// demand: reuse a cached vector, pack previously-scalarized lanes into a
/// vector, or broadcast an unknown/invariant scalar.
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the value
    // is known to be uniform after vectorization, this corresponds to lane zero
    // of the Part unroll iteration. Otherwise, the last instruction is the one
    // we created for the last vector lane of the Part unroll iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from undef.
2009 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2010 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2011 for (unsigned Lane = 0; Lane < VF; ++Lane) 2012 packScalarIntoVectorValue(V, {Part, Lane}); 2013 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2014 } 2015 Builder.restoreIP(OldIP); 2016 return VectorValue; 2017 } 2018 2019 // If this scalar is unknown, assume that it is a constant or that it is 2020 // loop invariant. Broadcast V and save the value for future uses. 2021 Value *B = getBroadcastInstrs(V); 2022 VectorLoopValueMap.setVectorValue(V, Part, B); 2023 return B; 2024 } 2025 2026 Value * 2027 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2028 const VPIteration &Instance) { 2029 // If the value is not an instruction contained in the loop, it should 2030 // already be scalar. 2031 if (OrigLoop->isLoopInvariant(V)) 2032 return V; 2033 2034 assert(Instance.Lane > 0 2035 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2036 : true && "Uniform values only have lane zero"); 2037 2038 // If the value from the original loop has not been vectorized, it is 2039 // represented by UF x VF scalar values in the new loop. Return the requested 2040 // scalar value. 2041 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2042 return VectorLoopValueMap.getScalarValue(V, Instance); 2043 2044 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2045 // for the given unroll part. If this entry is not a vector type (i.e., the 2046 // vectorization factor is one), there is no need to generate an 2047 // extractelement instruction. 2048 auto *U = getOrCreateVectorValue(V, Instance.Part); 2049 if (!U->getType()->isVectorTy()) { 2050 assert(VF == 1 && "Value not scalarized has non-vector type"); 2051 return U; 2052 } 2053 2054 // Otherwise, the value from the original loop has been vectorized and is 2055 // represented by UF vector values. 
Extract and return the requested scalar 2056 // value from the appropriate vector lane. 2057 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2058 } 2059 2060 void InnerLoopVectorizer::packScalarIntoVectorValue( 2061 Value *V, const VPIteration &Instance) { 2062 assert(V != Induction && "The new induction variable should not be used."); 2063 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2064 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2065 2066 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2067 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2068 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2069 Builder.getInt32(Instance.Lane)); 2070 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2071 } 2072 2073 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2074 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2075 SmallVector<Constant *, 8> ShuffleMask; 2076 for (unsigned i = 0; i < VF; ++i) 2077 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2078 2079 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2080 ConstantVector::get(ShuffleMask), 2081 "reverse"); 2082 } 2083 2084 // Return whether we allow using masked interleave-groups (for dealing with 2085 // strided loads/stores that reside in predicated blocks, or for dealing 2086 // with gaps). 2087 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2088 // If an override option has been passed in for interleaved accesses, use it. 2089 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2090 return EnableMaskedInterleavedMemAccesses; 2091 2092 return TTI.enableMaskedInterleavedAccessVectorization(); 2093 } 2094 2095 // Try to vectorize the interleave group that \p Instr belongs to. 2096 // 2097 // E.g. 
// Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ...                     // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//       <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>  ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec            ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                   VectorParts *BlockInMask) {
  const InterleaveGroup<Instruction> *Group =
      Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if current instruction is not the insert position.
  if (Instr != Group->getInsertPos())
    return;

  const DataLayout &DL = Instr->getModule()->getDataLayout();
  Value *Ptr = getLoadStorePointerOperand(Instr);

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  // The wide access covers InterleaveFactor * VF consecutive elements.
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));

  // Prepare for the new pointers.
  setDebugLocFromInst(Builder, Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);

  VectorParts Mask;
  bool IsMaskForCondRequired = BlockInMask;
  if (IsMaskForCondRequired) {
    Mask = *BlockInMask;
    // TODO: extend the masked interleaved-group support to reversed access.
    assert(!Group->isReverse() && "Reversed masked interleave-group "
                                  "not supported.");
  }

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
    InBounds = gep->isInBounds();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].
    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
    if (InBounds)
      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  // When the group has gaps and no scalar epilogue is allowed, build a mask
  // that disables the lanes corresponding to the gap members.
  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (IsMaskForCondRequired || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (IsMaskForCondRequired) {
          // Replicate the block mask across the interleave factor, and combine
          // it with the gap mask if both are needed.
          auto *Undefs = UndefValue::get(Mask[Part]->getType());
          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              Mask[Part], Undefs, RepMask, "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
                                     GroupMask, UndefVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
                                            Group->getAlignment(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
      }
    }
    return;
  }

  // The sub vector type for current instruction.
  VectorType *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec = getOrCreateVectorValue(
          cast<StoreInst>(Member)->getValueOperand(), Part);
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                              "interleaved.vec");

    Instruction *NewStoreInstr;
    if (IsMaskForCondRequired) {
      auto *Undefs = UndefValue::get(Mask[Part]->getType());
      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
      Value *ShuffledMask = Builder.CreateShuffleVector(
          Mask[Part], Undefs, RepMask, "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
    }
    else
      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
                                                 Group->getAlignment());

    Group->addMetadata(NewStoreInstr);
  }
}

/// Emit the widened form of the memory access \p Instr (load or store)
/// according to the cost model's widening decision: interleave, consecutive
/// (possibly reversed) wide access, or gather/scatter.
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                     VectorParts *BlockInMask) {
  // Attempt to issue a wide load.
  // Body of vectorizeMemoryInstruction (signature appears earlier in the
  // file): widens a single consecutive / reverse-consecutive load or store,
  // or emits a gather/scatter, according to the cost model's decision.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  // The cost model has already decided how this access is to be widened;
  // honor that decision here rather than re-deriving it.
  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
         "CM decision should be taken at this point");
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = getMemInstValueType(Instr);
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = getLoadStorePointerOperand(Instr);
  unsigned Alignment = getLoadStoreAlignment(Instr);
  // An alignment of 0 means target abi alignment. We need to use the scalar's
  // target abi alignment in such a case.
  const DataLayout &DL = Instr->getModule()->getDataLayout();
  if (!Alignment)
    Alignment = DL.getABITypeAlignment(ScalarDataTy);
  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");

  // Handle consecutive loads/stores: a single scalar pointer (lane 0 of
  // part 0) addresses the whole wide access.
  if (ConsecutiveStride)
    Ptr = getOrCreateScalarValue(Ptr, {0, 0});

  VectorParts Mask;
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    Mask = *BlockInMask;

  // Propagate inbounds from the original GEP (if any) onto the per-part GEPs
  // created below.
  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(
          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
    InBounds = gep->isInBounds();

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        Mask[Part] = reverseVector(Mask[Part]);
    } else {
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
      PartPtr->setIsInBounds(InBounds);
    }

    // Cast the scalar element pointer to a pointer-to-vector for the wide
    // memory operation.
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, Ptr);
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            Mask[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, Ptr);
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                         UndefValue::get(DataTy),
                                         "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }
    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}

// Emit a single scalar copy of \p Instr for one (part, lane) position of the
// unrolled vector loop. \p Instance identifies that position; operands are
// remapped to their scalarized equivalents via getOrCreateScalarValue.
// \p IfPredicateInstr marks the clone for later sinking into a predicated
// block.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // Add the cloned scalar to the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block: predicated clones are recorded so they can be sunk into
  // their own conditional blocks later.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}

// Create the canonical induction PHI for loop \p L, running from \p Start to
// \p End in increments of \p Step. \p DL supplies the debug location.
// (Continues past this chunk boundary.)
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  // Build the PHI at the top of the header, then the increment/compare/branch
  // at the latch.
  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

// Compute (and cache in TripCount) the scalar trip count N of loop \p L as
// backedge-taken count + 1, expanded into the preheader. The result is
// widened/truncated to the widest induction type.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  // A pointer-typed count (e.g. from a pointer IV) must be converted to the
  // integer induction type before it can be used arithmetically.
  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}

// Compute (and cache in VectorTripCount) the number of iterations executed by
// the vector loop: N rounded to a multiple of VF*UF (down normally, up when
// the tail is folded by masking). (Continues past this chunk boundary.)
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  Constant *Step = ConstantInt::get(Ty, VF * UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // If there is a non-reversed interleaved group that may speculatively access
  // memory out-of-bounds, we need to ensure that there will be at least one
  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
  // the trip count, we set the remainder to be equal to the step. If the step
  // does not evenly divide the trip count, no adjustment is necessary since
  // there will already be scalar iterations. Note that the minimum iterations
  // check ensures that N >= Step.
  if (VF > 1 && Cost->requiresScalarEpilogue()) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}

// Cast vector value \p V to \p DstVTy, which must have the same element count
// and element bit-width. Uses a direct bit-or-pointer cast when the element
// types allow it; otherwise bridges pointer<->float elements through an
// integer vector of matching width.
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  unsigned VF = DstVTy->getNumElements();
  VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  VectorType *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}

// Emit the guard in the preheader that bypasses the vector loop (branching to
// \p Bypass, the scalar preheader) when the trip count is too small for even
// one vector iteration. (Continues past this chunk boundary.)
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  BasicBlock *BB = L->getLoopPreheader();
  IRBuilder<> Builder(BB->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero.
  // This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                          : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking())
    CheckMinIters = Builder.CreateICmp(
        P, Count, ConstantInt::get(Count->getType(), VF * UF),
        "min.iters.check");

  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
  LoopBypassBlocks.push_back(BB);
}

// Emit a runtime check of the SCEV predicate assumptions made during analysis
// of loop \p L, branching to \p Bypass (the scalar loop) when they fail.
// No-op when the expanded predicate folds to constant-false (nothing to
// check).
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code to check that the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck =
      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());

  // A constant-zero check can never fail; no bypass block is needed.
  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!Cost->foldTailByMasking() &&
         "Cannot SCEV check stride or overflow when folding tail");
  // Create a new block containing the stride check.
  BB->setName("vector.scevcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;
}

// Emit runtime memory-overlap (aliasing) checks for loop \p L, branching to
// \p Bypass when arrays may overlap. (Continues past this chunk boundary.)
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return;

  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
  // No check instruction means LoopAccessInfo required no runtime checks.
  if (!MemRuntimeCheck)
    return;

  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
  // Create a new block containing the memory check.
  BB->setName("vector.memcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                           PSE.getSE());
  LVer->prepareNoAliasMetadata();
}

// Compute the value of an induction described by \p ID at iteration \p Index:
// Start + Index * Step (integer), a GEP from Start (pointer), or
// Start fadd/fsub Index * Step (floating point). Returns nullptr for
// IK_NoInduction. Emits instructions through \p B.
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType() == Step->getType() &&
         "Index type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  // Add with constant-zero folding done by hand (see note above on why we
  // can't ask SCEV to simplify).
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  // Mul with constant-one folding done by hand.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    return B.CreateMul(X, Y);
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Step of -1: Start - Index, avoiding the mul entirely.
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
                                           &*B.GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();

    // Floating point operations had to be 'fast' to enable the induction.
    FastMathFlags Flags;
    Flags.setFast();

    Value *MulExp = B.CreateFMul(StepValue, Index);
    if (isa<Instruction>(MulExp))
      // We have to check, the MulExp may be a constant.
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);

    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                               "induction");
    if (isa<Instruction>(BOp))
      cast<Instruction>(BOp)->setFastMathFlags(Flags);

    return BOp;
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}

// Build the CFG skeleton (checks, vector preheader/body, middle block, scalar
// preheader) into which the vectorized loop is emitted; returns the vector
// preheader. (Continues past this chunk boundary.)
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [ ] \
  |    [ ]_|   <-- vector loop.
  |     |
  |     v
  |   -[ ]   <--- middle-block.
  |  /  |
  | /   v
  -|- >[ ]     <--- new preheader.
  |     |
  |     v
  |    [ ] \
  |    [ ]_|   <-- old scalar loop to handle remainder.
   \    |
    \   v
     >[ ]     <-- exit block.
   ...
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  assert(VectorPH && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VecBody =
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(VecBody, *LI);

  // Find the loop boundaries.
  Value *Count = getOrCreateTripCount(Lp);

  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, ScalarPH);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, ScalarPH);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, ScalarPH);

  // Generate the induction variable.
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop. It is used
  // to test if there are any tail iterations left once the vector loop has
  // completed.
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (auto &InductionEntry : *List) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal = PHINode::Create(
        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = CountRoundDown;
    } else {
      // Non-primary IVs: compute the IV's value after CountRoundDown
      // iterations via its induction descriptor.
      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, MiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);
    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
  }

  // We need the OrigLoop (scalar loop part) latch terminator to help
  // produce correct debug info for the middle block BB instructions.
  // The legality check stage guarantees that the loop will have a single
  // latch.
  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
         "Scalar loop latch terminator isn't a branch");
  BranchInst *ScalarLatchBr =
      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  // If tail is to be folded, we know we don't need to run the remainder.
  Value *CmpN = Builder.getTrue();
  if (!Cost->foldTailByMasking()) {
    CmpN =
        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch branch instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
  }

  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());

  // Save the state.
  LoopVectorPreHeader = Lp->getLoopPreheader();
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    Lp->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    Lp->setLoopID(LID);

  LoopVectorizeHints Hints(Lp, true, *ORE);
  Hints.setAlreadyVectorized();

  return LoopVectorPreHeader;
}

// Fix up external users of the induction variable.
// At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  // Maps each external (out-of-loop) user PHI to the value it must receive
  // when control arrives from the middle block.
  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      // Materialize CRD - 1 in the middle block and transform it through the
      // induction descriptor to obtain the escaping (penultimate) IV value.
      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      // FP inductions need an int-to-FP cast of the count; integer inductions
      // just need the count resized to the step's type.
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {

/// DenseMap traits that let instructions be CSE'd by structural identity
/// (opcode + operands) rather than by pointer, for the instruction kinds
/// canHandle() accepts.
struct CSEDenseMapInfo {
  // Only address-computation / vector-shuffling instructions are considered;
  // these are the redundant forms the vectorizer itself tends to emit.
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  // Hash on opcode and the full operand list so structurally identical
  // instructions collide.
  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys must only compare equal to themselves; isIdenticalTo would
    // dereference them.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  // Iterate with a pre-incremented iterator so erasing In is safe.
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *In = &*I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}

/// Estimate the cost of executing call \p CI at vectorization factor \p VF.
/// Compares the cost of scalarizing the call VF times (plus the
/// extract/insert scalarization overhead) against calling a vectorized
/// library variant, when TLI reports one is available. \p NeedToScalarize is
/// set to false only when the vector library call is the cheaper option.
unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                                       unsigned VF,
                                                       bool &NeedToScalarize) {
  Function *F = CI->getCalledFunction();
  StringRef FnName = CI->getCalledFunction()->getName();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->arg_operands())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
  if (VF == 1)
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);

  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;

  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
  NeedToScalarize = true;
  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
    return Cost;

  // If the corresponding vector cost is cheaper, return its cost.
  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    return VectorCallCost;
  }
  return Cost;
}

/// Estimate the cost of lowering call \p CI as a vector intrinsic at
/// vectorization factor \p VF, forwarding any fast-math flags on the call to
/// the TTI query.
unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                            unsigned VF) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");

  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<Value *, 4> Operands(CI->arg_operands());
  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
}

/// Return whichever of the two integer vector types has the narrower element.
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}
/// Return whichever of the two integer vector types has the wider element.
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}

void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      // Skip values already rewritten, dead values, and non-instructions
      // (e.g. constants) — there is nothing to shrink for those.
      if (Erased.find(I) != Erased.end() || I->use_empty() ||
          !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
                                          OriginalTy->getVectorNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      // Narrow an operand to TruncatedTy, peeling off a matching zext rather
      // than stacking a trunc on top of it when possible.
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        // Only the selected values are shrunk; the condition keeps its type.
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // The two shuffle inputs may have different element counts; shrink
        // each to its own narrowed vector type and reuse the original mask.
        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      // A dead zext produced above is dropped and its (narrow) operand
      // becomes the mapped vector value for this part.
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}

/// Run the post-widening fixups that make the vectorized loop complete:
/// bitwidth shrinking, PHI wiring, IV user fixups, LCSSA maintenance,
/// predicated-operand sinking and redundant-instruction CSE.
void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF > 1)
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Update the dominator tree.
  //
  // FIXME: After creating the structure of the new loop, the dominator tree is
  //        no longer up-to-date, and it remains that way until we update it
  //        here. An out-of-date dominator tree is problematic for SCEV,
  //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpanders in several places. Instead, we should
  //        keep the dominator tree up-to-date as we go.
  updateAnalysis();

  // Fix-up external users of the induction variables.
  for (auto &Entry : *Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);
}

void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value: the scalar init goes into the
  // last lane; the other lanes are never read before being shuffled over.
  auto *VectorInit = ScalarInit;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    VectorInit = Builder.CreateInsertElement(
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

  // Set the insertion point after the previous value if it is an instruction.
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop. Also, if the previous
  // value is a phi node, we should insert after all the phi nodes to avoid
  // breaking basic block verification.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
      isa<PHINode>(PreviousLastPart))
    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
  else
    Builder.SetInsertPoint(
        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));

  // We will construct a vector for the recurrence by combining the values for
  // the current and previous iterations. This is the required shuffle mask.
  SmallVector<Constant *, 8> ShuffleMask(VF);
  ShuffleMask[0] = Builder.getInt32(VF - 1);
  for (unsigned I = 1; I < VF; ++I)
    ShuffleMask[I] = Builder.getInt32(I + VF - 1);

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
    // For VF == 1 there is nothing to shuffle; the previous part is used
    // directly.
    auto *Shuffle =
        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
                                             ConstantVector::get(ShuffleMask))
               : Incoming;
    // Replace the temporary phi from phase one with the real recurrence value.
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    ExtractForScalar = Builder.CreateExtractElement(
        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF > 1)
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    // Coming from the middle block we resume with the extracted lane;
    // coming from a bypass block the original scalar init is still correct.
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}

void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
      RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, or, xor,
  // one for multiplication, -1 for And.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
    cast<PHINode>(VecRdxPhi)
        ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      // Redirect every user of the wide value except the new trunc itself to
      // the trunc+ext chain. The iterator is advanced before mutation because
      // replaceUsesOfWith invalidates the current use.
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
  for (unsigned Part = 1; Part < UF; ++Part) {
    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
      // Floating point operations had to be 'fast' to enable the reduction.
      ReducedPartRdx = addFastMathFlag(
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
                              ReducedPartRdx, "bin.rdx"),
          RdxDesc.getFastMathFlags());
    else
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                      RdxPart);
  }

  if (VF > 1) {
    bool NoNaN = Legal->hasFunNoNaNAttr();
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (Phi->getType() != RdxDesc.getRecurrenceType())
      ReducedPartRdx =
          RdxDesc.isSigned()
              ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
              : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.
  // We know that the loop is in LCSSA form. We need to update the
  // PHI nodes in the exit blocks.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    // All PHINodes need to have a single entry edge, or two if
    // we already fixed them.
    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

    // We found a reduction value exit-PHI. Update it with the
    // incoming bypass edge.
    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
  } // end of the LCSSA phi scan.
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}

// Give every single-entry LCSSA phi in the scalar exit block a second
// incoming value coming from the vector loop's middle block, so the phi is
// correct on both the vector and the scalar paths.
void InnerLoopVectorizer::fixLCSSAPHIs() {
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getNumIncomingValues() == 1) {
      auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values will have only one value.
      // For an instruction, pick lane 0 if it is uniform after
      // vectorization, otherwise the last lane (VF - 1).
      unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF - 1;
      // Can be a loop invariant incoming value or the last scalar value to be
      // extracted from the vectorized loop.
      Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *lastIncomingValue =
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
    }
  }
}

// Iteratively sink the scalarized operands of a predicated instruction into
// the predicated block, so they only execute when the predicate holds.
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, it's
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // it's operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}

// Fill in the incoming values of the widened non-induction phis that were
// created with no operands in the VPlan-native path (see OrigPHIsToFix).
void InnerLoopVectorizer::fixNonInductionPHIs() {
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    PHINode *NewPhi =
        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();

    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
        predecessors(OrigPhi->getParent()));
    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
        predecessors(NewPhi->getParent()));
    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
           "Scalar and Vector BB should have the same number of predecessors");

    // The insertion point in Builder may be invalidated by the time we get
    // here. Force the Builder insertion point to something valid so that we do
    // not run into issues during insertion point restore in
    // getOrCreateVectorValue calls below.
    Builder.SetInsertPoint(NewPhi);

    // The predecessor order is preserved and we can rely on mapping between
    // scalar and vector block predecessors.
    for (unsigned i = 0; i < NumIncomingValues; ++i) {
      BasicBlock *NewPredBB = VectorBBPredecessors[i];

      // When looking up the new scalar/vector values to fix up, use incoming
      // values from original phi.
      Value *ScIncV =
          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);

      // Scalar incoming value may need a broadcast
      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
      NewPhi->addIncoming(NewIncV, NewPredBB);
    }
  }
}

// Widen a header phi: reductions/first-order recurrences get an empty vector
// phi (stage one of two), pointer inductions get per-lane scalar GEPs.
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                              unsigned VF) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy =
        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      Type *VecTy =
          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
    }
    return;
  }

  setDebugLocFromInst(Builder, P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = Induction;
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
    // Determine the number of scalars we need to generate for each unroll
    // iteration. If the instruction is uniform, we only need to generate the
    // first lane. Otherwise, we generate all VF values.
    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
    // These are the scalar results. Notice that we don't generate vector GEPs
    // because scalar GEPs result in better code.
    for (unsigned Part = 0; Part < UF; ++Part) {
      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
        Value *SclrGep =
            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
        SclrGep->setName("next.gep");
        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
      }
    }
    return;
  }
  }
}

/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are non compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
static bool mayDivideByZero(Instruction &I) {
  assert((I.getOpcode() == Instruction::UDiv ||
          I.getOpcode() == Instruction::SDiv ||
          I.getOpcode() == Instruction::URem ||
          I.getOpcode() == Instruction::SRem) &&
         "Unexpected instruction");
  Value *Divisor = I.getOperand(1);
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
  // Only a divisor that is a provably non-zero compile-time constant is safe.
  return !CInt || CInt->isZero();
}

void InnerLoopVectorizer::widenInstruction(Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Br:
  case Instruction::PHI:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::GetElementPtr: {
    // Construct a vector GEP by widening the operands of the scalar GEP as
    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
    // results in a vector of pointers when at least one operand of the GEP
    // is vector-typed.
Thus, to keep the representation compact, we only use 4003 // vector-typed operands for loop-varying values. 4004 auto *GEP = cast<GetElementPtrInst>(&I); 4005 4006 if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { 4007 // If we are vectorizing, but the GEP has only loop-invariant operands, 4008 // the GEP we build (by only using vector-typed operands for 4009 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4010 // produce a vector of pointers, we need to either arbitrarily pick an 4011 // operand to broadcast, or broadcast a clone of the original GEP. 4012 // Here, we broadcast a clone of the original. 4013 // 4014 // TODO: If at some point we decide to scalarize instructions having 4015 // loop-invariant operands, this special case will no longer be 4016 // required. We would add the scalarization decision to 4017 // collectLoopScalars() and teach getVectorValue() to broadcast 4018 // the lane-zero scalar value. 4019 auto *Clone = Builder.Insert(GEP->clone()); 4020 for (unsigned Part = 0; Part < UF; ++Part) { 4021 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4022 VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); 4023 addMetadata(EntryPart, GEP); 4024 } 4025 } else { 4026 // If the GEP has at least one loop-varying operand, we are sure to 4027 // produce a vector of pointers. But if we are only unrolling, we want 4028 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4029 // produce with the code below will be scalar (if VF == 1) or vector 4030 // (otherwise). Note that for the unroll-only case, we still maintain 4031 // values in the vector mapping with initVector, as we do for other 4032 // instructions. 4033 for (unsigned Part = 0; Part < UF; ++Part) { 4034 // The pointer operand of the new GEP. If it's loop-invariant, we 4035 // won't broadcast it. 4036 auto *Ptr = 4037 OrigLoop->isLoopInvariant(GEP->getPointerOperand()) 4038 ? 
GEP->getPointerOperand() 4039 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4040 4041 // Collect all the indices for the new GEP. If any index is 4042 // loop-invariant, we won't broadcast it. 4043 SmallVector<Value *, 4> Indices; 4044 for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { 4045 if (OrigLoop->isLoopInvariant(U.get())) 4046 Indices.push_back(U.get()); 4047 else 4048 Indices.push_back(getOrCreateVectorValue(U.get(), Part)); 4049 } 4050 4051 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4052 // but it should be a vector, otherwise. 4053 auto *NewGEP = 4054 GEP->isInBounds() 4055 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4056 Indices) 4057 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4058 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4059 "NewGEP is not a pointer vector"); 4060 VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); 4061 addMetadata(NewGEP, GEP); 4062 } 4063 } 4064 4065 break; 4066 } 4067 case Instruction::UDiv: 4068 case Instruction::SDiv: 4069 case Instruction::SRem: 4070 case Instruction::URem: 4071 case Instruction::Add: 4072 case Instruction::FAdd: 4073 case Instruction::Sub: 4074 case Instruction::FSub: 4075 case Instruction::FNeg: 4076 case Instruction::Mul: 4077 case Instruction::FMul: 4078 case Instruction::FDiv: 4079 case Instruction::FRem: 4080 case Instruction::Shl: 4081 case Instruction::LShr: 4082 case Instruction::AShr: 4083 case Instruction::And: 4084 case Instruction::Or: 4085 case Instruction::Xor: { 4086 // Just widen unops and binops. 
4087 setDebugLocFromInst(Builder, &I); 4088 4089 for (unsigned Part = 0; Part < UF; ++Part) { 4090 SmallVector<Value *, 2> Ops; 4091 for (Value *Op : I.operands()) 4092 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4093 4094 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4095 4096 if (auto *VecOp = dyn_cast<Instruction>(V)) 4097 VecOp->copyIRFlags(&I); 4098 4099 // Use this vector value for all users of the original instruction. 4100 VectorLoopValueMap.setVectorValue(&I, Part, V); 4101 addMetadata(V, &I); 4102 } 4103 4104 break; 4105 } 4106 case Instruction::Select: { 4107 // Widen selects. 4108 // If the selector is loop invariant we can create a select 4109 // instruction with a scalar condition. Otherwise, use vector-select. 4110 auto *SE = PSE.getSE(); 4111 bool InvariantCond = 4112 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4113 setDebugLocFromInst(Builder, &I); 4114 4115 // The condition can be loop invariant but still defined inside the 4116 // loop. This means that we can't just use the original 'cond' value. 4117 // We have to take the 'vectorized' value and pick the first lane. 4118 // Instcombine will make this a no-op. 4119 4120 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4121 4122 for (unsigned Part = 0; Part < UF; ++Part) { 4123 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4124 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4125 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4126 Value *Sel = 4127 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4128 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4129 addMetadata(Sel, &I); 4130 } 4131 4132 break; 4133 } 4134 4135 case Instruction::ICmp: 4136 case Instruction::FCmp: { 4137 // Widen compares. Generate vector compares. 
4138 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4139 auto *Cmp = dyn_cast<CmpInst>(&I); 4140 setDebugLocFromInst(Builder, Cmp); 4141 for (unsigned Part = 0; Part < UF; ++Part) { 4142 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4143 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4144 Value *C = nullptr; 4145 if (FCmp) { 4146 // Propagate fast math flags. 4147 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4148 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4149 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4150 } else { 4151 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4152 } 4153 VectorLoopValueMap.setVectorValue(&I, Part, C); 4154 addMetadata(C, &I); 4155 } 4156 4157 break; 4158 } 4159 4160 case Instruction::ZExt: 4161 case Instruction::SExt: 4162 case Instruction::FPToUI: 4163 case Instruction::FPToSI: 4164 case Instruction::FPExt: 4165 case Instruction::PtrToInt: 4166 case Instruction::IntToPtr: 4167 case Instruction::SIToFP: 4168 case Instruction::UIToFP: 4169 case Instruction::Trunc: 4170 case Instruction::FPTrunc: 4171 case Instruction::BitCast: { 4172 auto *CI = dyn_cast<CastInst>(&I); 4173 setDebugLocFromInst(Builder, CI); 4174 4175 /// Vectorize casts. 4176 Type *DestTy = 4177 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4178 4179 for (unsigned Part = 0; Part < UF; ++Part) { 4180 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4181 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4182 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4183 addMetadata(Cast, &I); 4184 } 4185 break; 4186 } 4187 4188 case Instruction::Call: { 4189 // Ignore dbg intrinsics. 
4190 if (isa<DbgInfoIntrinsic>(I)) 4191 break; 4192 setDebugLocFromInst(Builder, &I); 4193 4194 Module *M = I.getParent()->getParent()->getParent(); 4195 auto *CI = cast<CallInst>(&I); 4196 4197 StringRef FnName = CI->getCalledFunction()->getName(); 4198 Function *F = CI->getCalledFunction(); 4199 Type *RetTy = ToVectorTy(CI->getType(), VF); 4200 SmallVector<Type *, 4> Tys; 4201 for (Value *ArgOperand : CI->arg_operands()) 4202 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4203 4204 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4205 4206 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4207 // version of the instruction. 4208 // Is it beneficial to perform intrinsic call compared to lib call? 4209 bool NeedToScalarize; 4210 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4211 bool UseVectorIntrinsic = 4212 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4213 assert((UseVectorIntrinsic || !NeedToScalarize) && 4214 "Instruction should be scalarized elsewhere."); 4215 4216 for (unsigned Part = 0; Part < UF; ++Part) { 4217 SmallVector<Value *, 4> Args; 4218 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4219 Value *Arg = CI->getArgOperand(i); 4220 // Some intrinsics have a scalar argument - don't replace it with a 4221 // vector. 4222 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4223 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4224 Args.push_back(Arg); 4225 } 4226 4227 Function *VectorF; 4228 if (UseVectorIntrinsic) { 4229 // Use vector version of the intrinsic. 4230 Type *TysForDecl[] = {CI->getType()}; 4231 if (VF > 1) 4232 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4233 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4234 } else { 4235 // Use vector version of the library call. 
4236 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); 4237 assert(!VFnName.empty() && "Vector function name is empty."); 4238 VectorF = M->getFunction(VFnName); 4239 if (!VectorF) { 4240 // Generate a declaration 4241 FunctionType *FTy = FunctionType::get(RetTy, Tys, false); 4242 VectorF = 4243 Function::Create(FTy, Function::ExternalLinkage, VFnName, M); 4244 VectorF->copyAttributesFrom(F); 4245 } 4246 } 4247 assert(VectorF && "Can't create vector function."); 4248 4249 SmallVector<OperandBundleDef, 1> OpBundles; 4250 CI->getOperandBundlesAsDefs(OpBundles); 4251 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4252 4253 if (isa<FPMathOperator>(V)) 4254 V->copyFastMathFlags(CI); 4255 4256 VectorLoopValueMap.setVectorValue(&I, Part, V); 4257 addMetadata(V, &I); 4258 } 4259 4260 break; 4261 } 4262 4263 default: 4264 // This instruction is not vectorized by simple widening. 4265 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4266 llvm_unreachable("Unhandled instruction!"); 4267 } // end of switch. 4268 } 4269 4270 void InnerLoopVectorizer::updateAnalysis() { 4271 // Forget the original basic block. 4272 PSE.getSE()->forgetLoop(OrigLoop); 4273 4274 // DT is not kept up-to-date for outer loop vectorization 4275 if (EnableVPlanNativePath) 4276 return; 4277 4278 // Update the dominator tree information. 4279 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && 4280 "Entry does not dominate exit."); 4281 4282 DT->addNewBlock(LoopMiddleBlock, 4283 LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4284 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); 4285 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 4286 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); 4287 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 4288 } 4289 4290 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4291 // We should not collect Scalars more than once per VF. 
  // Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use
  // will be a scalar use, and the pointer is only used by memory accesses, we
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar use,
  // and (3) pointer induction variables and their update instructions (we
  // currently only scalarize these).
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory accesses is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        // A store constrains both its pointer and its value operand.
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // Only pointers never seen in a possibly-non-scalar context survive.
  for (auto *I : ScalarPtrs)
    if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // (3) Add to the worklist all pointer induction variables and their update
  // instructions.
  //
  // TODO: Once we are able to vectorize pointer induction variables we should
  // no longer insert them into the worklist here.
  auto *Latch = TheLoop->getLoopLatch();
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
      continue;
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // We already considered pointer induction variables, so there's no reason
    // to look at their users again.
    //
    // TODO: Once we are able to vectorize pointer induction variables we
    // should no longer skip over them here.
    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}

// Returns true if I must be scalarized and executed predicated (it sits in a
// block that needs predication and cannot be safely widened).
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction, get that
    // result.
    if (VF > 1) {
      InstWidening WideningDecision = getWideningDecision(I, VF);
      assert(WideningDecision != CM_Unknown &&
             "Widening decision should be ready at this moment");
      return WideningDecision == CM_Scalarize;
    }
    // VF == 1: predication is needed unless the target supports a masked
    // form of this access.
    return isa<LoadInst>(I) ?
        !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
        : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // Integer division must be predicated unless the divisor is a non-zero
    // compile-time constant (see mayDivideByZero()).
    return mayDivideByZero(*I);
  }
  return false;
}

// Returns true if the interleave group containing I can be emitted as wide
// (possibly masked) loads/stores rather than being scalarized.
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal it's type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  // Masking is needed; widening is only possible if the target supports the
  // masked form of this access.
  auto *Ty = getMemInstValueType(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
                          : TTI.isLegalMaskedStore(Ty);
}

// Returns true if the memory instruction I can be widened into a single
// consecutive vector load/store for the given VF.
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal it's type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again.  Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  // Values that are not instructions, or are defined outside the loop, can
  // never be loop-uniform candidates.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
    Worklist.insert(Cmp);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

  // True iff the memory access will be emitted as a single wide (possibly
  // reversed or interleaved) operation, i.e. it is not scalarized or
  // gather/scattered.
  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };
  // Iterate over the instructions in the loop, and collect all
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
  // that a consecutive-like pointer operand will be scalarized, we collect it
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
  // getelementptr instruction can be used by both vectorized and scalarized
  // memory instructions. For example, if a loop loads and stores from the same
  // location, but the store is conditional, the store will be scalarized, and
  // the getelementptr won't remain uniform.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (!Ptr)
        continue;

      // True if all users of Ptr are memory accesses that have Ptr as their
      // pointer operand.
      auto UsersAreMemAccesses =
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
            return getLoadStorePointerOperand(U) == Ptr;
          });

      // Ensure the memory instruction will not be scalarized or used by
      // gather/scatter, making its pointer operand non-uniform. If the pointer
      // operand is used by any instruction other than a memory access, we
      // conservatively assume the pointer operand may be non-uniform.
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, or interleaving - the pointer operand should
      // remain uniform.
      else
        ConsecutiveLikePtrs.insert(Ptr);
    }

  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
      Worklist.insert(V);
    }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
4674 unsigned idx = 0; 4675 while (idx != Worklist.size()) { 4676 Instruction *I = Worklist[idx++]; 4677 4678 for (auto OV : I->operand_values()) { 4679 // isOutOfScope operands cannot be uniform instructions. 4680 if (isOutOfScope(OV)) 4681 continue; 4682 // First order recurrence Phi's should typically be considered 4683 // non-uniform. 4684 auto *OP = dyn_cast<PHINode>(OV); 4685 if (OP && Legal->isFirstOrderRecurrence(OP)) 4686 continue; 4687 // If all the users of the operand are uniform, then add the 4688 // operand into the uniform worklist. 4689 auto *OI = cast<Instruction>(OV); 4690 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4691 auto *J = cast<Instruction>(U); 4692 return Worklist.count(J) || 4693 (OI == getLoadStorePointerOperand(J) && 4694 isUniformDecision(J, VF)); 4695 })) { 4696 Worklist.insert(OI); 4697 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); 4698 } 4699 } 4700 } 4701 4702 // Returns true if Ptr is the pointer operand of a memory access instruction 4703 // I, and I is known to not require scalarization. 4704 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4705 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4706 }; 4707 4708 // For an instruction to be added into Worklist above, all its users inside 4709 // the loop should also be in Worklist. However, this condition cannot be 4710 // true for phi nodes that form a cyclic dependence. We must process phi 4711 // nodes separately. An induction variable will remain uniform if all users 4712 // of the induction variable and induction variable update remain uniform. 4713 // The code below handles both pointer and non-pointer induction variables. 
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
                      << "\n");
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}

// Returns true if vectorizing this loop would need runtime checks (pointer,
// SCEV predicate, or stride==1 checks), which are not acceptable when
// optimizing for size; emits a vectorization-failure remark for the first
// offending kind of check.
bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

// Computes the maximum vectorization factor for this loop, or None if
// vectorization must be abandoned (divergent target needing runtime pointer
// checks, single-iteration loop, or size constraints that cannot be met).
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may by useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededPredicatePragma:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI))
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();

  unsigned MaxVF = computeFeasibleMaxVF(TC);
  if (TC > 0 && TC % MaxVF == 0) {
    // Accept MaxVF if we do not have a tail.
4840 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4841 return MaxVF; 4842 } 4843 4844 // If we don't know the precise trip count, or if the trip count that we 4845 // found modulo the vectorization factor is not zero, try to fold the tail 4846 // by masking. 4847 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4848 if (Legal->canFoldTailByMasking()) { 4849 FoldTailByMasking = true; 4850 return MaxVF; 4851 } 4852 4853 if (TC == 0) { 4854 reportVectorizationFailure( 4855 "Unable to calculate the loop count due to complex control flow", 4856 "unable to calculate the loop count due to complex control flow", 4857 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4858 return None; 4859 } 4860 4861 reportVectorizationFailure( 4862 "Cannot optimize for size and vectorize at the same time.", 4863 "cannot optimize for size and vectorize at the same time. " 4864 "Enable vectorization of this loop with '#pragma clang loop " 4865 "vectorize(enable)' when compiling with -Os/-Oz", 4866 "NoTailLoopWithOptForSize", ORE, TheLoop); 4867 return None; 4868 } 4869 4870 unsigned 4871 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 4872 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4873 unsigned SmallestType, WidestType; 4874 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4875 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 4876 4877 // Get the maximum safe dependence distance in bits computed by LAA. 4878 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4879 // the memory accesses that is most restrictive (involved in the smallest 4880 // dependence distance). 
4881 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 4882 4883 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 4884 4885 unsigned MaxVectorSize = WidestRegister / WidestType; 4886 4887 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4888 << " / " << WidestType << " bits.\n"); 4889 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4890 << WidestRegister << " bits.\n"); 4891 4892 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 4893 " into one vector!"); 4894 if (MaxVectorSize == 0) { 4895 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 4896 MaxVectorSize = 1; 4897 return MaxVectorSize; 4898 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 4899 isPowerOf2_32(ConstTripCount)) { 4900 // We need to clamp the VF to be the ConstTripCount. There is no point in 4901 // choosing a higher viable VF as done in the loop below. 4902 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 4903 << ConstTripCount << "\n"); 4904 MaxVectorSize = ConstTripCount; 4905 return MaxVectorSize; 4906 } 4907 4908 unsigned MaxVF = MaxVectorSize; 4909 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 4910 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 4911 // Collect all viable vectorization factors larger than the default MaxVF 4912 // (i.e. MaxVectorSize). 4913 SmallVector<unsigned, 8> VFs; 4914 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 4915 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 4916 VFs.push_back(VS); 4917 4918 // For each VF calculate its register usage. 4919 auto RUs = calculateRegisterUsage(VFs); 4920 4921 // Select the largest VF which doesn't require more registers than existing 4922 // ones. 
4923 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); 4924 for (int i = RUs.size() - 1; i >= 0; --i) { 4925 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { 4926 MaxVF = VFs[i]; 4927 break; 4928 } 4929 } 4930 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 4931 if (MaxVF < MinVF) { 4932 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 4933 << ") with target's minimum: " << MinVF << '\n'); 4934 MaxVF = MinVF; 4935 } 4936 } 4937 } 4938 return MaxVF; 4939 } 4940 4941 VectorizationFactor 4942 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 4943 float Cost = expectedCost(1).first; 4944 const float ScalarCost = Cost; 4945 unsigned Width = 1; 4946 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 4947 4948 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 4949 if (ForceVectorization && MaxVF > 1) { 4950 // Ignore scalar width, because the user explicitly wants vectorization. 4951 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 4952 // evaluation. 4953 Cost = std::numeric_limits<float>::max(); 4954 } 4955 4956 for (unsigned i = 2; i <= MaxVF; i *= 2) { 4957 // Notice that the vector loop needs to be executed less times, so 4958 // we need to divide the cost of the vector loops by the width of 4959 // the vector elements. 
4960 VectorizationCostTy C = expectedCost(i); 4961 float VectorCost = C.first / (float)i; 4962 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 4963 << " costs: " << (int)VectorCost << ".\n"); 4964 if (!C.second && !ForceVectorization) { 4965 LLVM_DEBUG( 4966 dbgs() << "LV: Not considering vector loop of width " << i 4967 << " because it will not generate any vector instructions.\n"); 4968 continue; 4969 } 4970 if (VectorCost < Cost) { 4971 Cost = VectorCost; 4972 Width = i; 4973 } 4974 } 4975 4976 if (!EnableCondStoresVectorization && NumPredStores) { 4977 reportVectorizationFailure("There are conditional stores.", 4978 "store that is conditionally executed prevents vectorization", 4979 "ConditionalStore", ORE, TheLoop); 4980 Width = 1; 4981 Cost = ScalarCost; 4982 } 4983 4984 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 4985 << "LV: Vectorization seems to be not beneficial, " 4986 << "but was forced by a user.\n"); 4987 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 4988 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 4989 return Factor; 4990 } 4991 4992 std::pair<unsigned, unsigned> 4993 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 4994 unsigned MinWidth = -1U; 4995 unsigned MaxWidth = 8; 4996 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 4997 4998 // For each block. 4999 for (BasicBlock *BB : TheLoop->blocks()) { 5000 // For each instruction in the loop. 5001 for (Instruction &I : BB->instructionsWithoutDebug()) { 5002 Type *T = I.getType(); 5003 5004 // Skip ignored values. 5005 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5006 continue; 5007 5008 // Only examine Loads, Stores and PHINodes. 5009 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5010 continue; 5011 5012 // Examine PHI nodes that are reduction variables. Update the type to 5013 // account for the recurrence type. 
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    }
  }

  return {MinWidth, MaxWidth};
}

// Selects the interleave (unroll) count for the given VF and loop cost,
// balancing ILP exposure against register pressure.
unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not interleave loops with a relatively small trip count.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
    return 1;

  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                    << " registers\n");

  if (VF == 1) {
    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumScalarRegs;
  } else {
    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumVectorRegs;
  }

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
5101 // We also want power of two interleave counts to ensure that the induction 5102 // variable of the vector loop wraps to zero, when tail is folded by masking; 5103 // this currently happens when OptForSize, in which case IC is set to 1 above. 5104 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / 5105 R.MaxLocalUsers); 5106 5107 // Don't count the induction variable as interleaved. 5108 if (EnableIndVarRegisterHeur) 5109 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / 5110 std::max(1U, (R.MaxLocalUsers - 1))); 5111 5112 // Clamp the interleave ranges to reasonable counts. 5113 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5114 5115 // Check if the user has overridden the max. 5116 if (VF == 1) { 5117 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5118 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5119 } else { 5120 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5121 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5122 } 5123 5124 // If we did not calculate the cost for VF (because the user selected the VF) 5125 // then we calculate the cost of VF here. 5126 if (LoopCost == 0) 5127 LoopCost = expectedCost(VF).first; 5128 5129 assert(LoopCost && "Non-zero loop cost expected"); 5130 5131 // Clamp the calculated IC to be between the 1 and the max interleave count 5132 // that the target allows. 5133 if (IC > MaxInterleaveCount) 5134 IC = MaxInterleaveCount; 5135 else if (IC < 1) 5136 IC = 1; 5137 5138 // Interleave if we vectorized this loop and there is a reduction that could 5139 // benefit from interleaving. 5140 if (VF > 1 && !Legal->getReductionVars()->empty()) { 5141 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5142 return IC; 5143 } 5144 5145 // Note that if we've already vectorized the loop we will have done the 5146 // runtime check and so interleaving won't require further checks. 
5147 bool InterleavingRequiresRuntimePointerCheck = 5148 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5149 5150 // We want to interleave small loops in order to reduce the loop overhead and 5151 // potentially expose ILP opportunities. 5152 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5153 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5154 // We assume that the cost overhead is 1 and we use the cost model 5155 // to estimate the cost of the loop and interleave until the cost of the 5156 // loop overhead is about 5% of the cost of the loop. 5157 unsigned SmallIC = 5158 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5159 5160 // Interleave until store/load ports (estimated by max interleave count) are 5161 // saturated. 5162 unsigned NumStores = Legal->getNumStores(); 5163 unsigned NumLoads = Legal->getNumLoads(); 5164 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5165 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5166 5167 // If we have a scalar reduction (vector reductions are already dealt with 5168 // by this point), we can increase the critical path length if the loop 5169 // we're interleaving is inside another loop. Limit, by default to 2, so the 5170 // critical path only gets increased by one reduction operation. 
    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = !Legal->getReductionVars()->empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

// Estimates, for each candidate VF, the number of registers simultaneously
// live inside the loop (MaxLocalUsers) and the number of loop-invariant
// registers (LoopInvariantRegs), via a live-interval scan in RPO.
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
5259 EndPoint[Instr] = IdxToInstr.size(); 5260 Ends.insert(Instr); 5261 } 5262 } 5263 } 5264 5265 // Saves the list of intervals that end with the index in 'key'. 5266 using InstrList = SmallVector<Instruction *, 2>; 5267 DenseMap<unsigned, InstrList> TransposeEnds; 5268 5269 // Transpose the EndPoints to a list of values that end at each index. 5270 for (auto &Interval : EndPoint) 5271 TransposeEnds[Interval.second].push_back(Interval.first); 5272 5273 SmallPtrSet<Instruction *, 8> OpenIntervals; 5274 5275 // Get the size of the widest register. 5276 unsigned MaxSafeDepDist = -1U; 5277 if (Legal->getMaxSafeDepDistBytes() != -1U) 5278 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5279 unsigned WidestRegister = 5280 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5281 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5282 5283 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5284 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); 5285 5286 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5287 5288 // A lambda that gets the register usage for the given type and VF. 5289 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5290 if (Ty->isTokenTy()) 5291 return 0U; 5292 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5293 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5294 }; 5295 5296 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5297 Instruction *I = IdxToInstr[i]; 5298 5299 // Remove all of the instructions that end at this location. 5300 InstrList &List = TransposeEnds[i]; 5301 for (Instruction *ToRemove : List) 5302 OpenIntervals.erase(ToRemove); 5303 5304 // Ignore instructions that are never used within the loop. 5305 if (Ends.find(I) == Ends.end()) 5306 continue; 5307 5308 // Skip ignored values. 5309 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5310 continue; 5311 5312 // For each VF find the maximum usage of registers. 
5313 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5314 if (VFs[j] == 1) { 5315 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); 5316 continue; 5317 } 5318 collectUniformsAndScalars(VFs[j]); 5319 // Count the number of live intervals. 5320 unsigned RegUsage = 0; 5321 for (auto Inst : OpenIntervals) { 5322 // Skip ignored values for VF > 1. 5323 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || 5324 isScalarAfterVectorization(Inst, VFs[j])) 5325 continue; 5326 RegUsage += GetRegUsage(Inst->getType(), VFs[j]); 5327 } 5328 MaxUsages[j] = std::max(MaxUsages[j], RegUsage); 5329 } 5330 5331 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5332 << OpenIntervals.size() << '\n'); 5333 5334 // Add the current instruction to the list of open intervals. 5335 OpenIntervals.insert(I); 5336 } 5337 5338 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5339 unsigned Invariant = 0; 5340 if (VFs[i] == 1) 5341 Invariant = LoopInvariants.size(); 5342 else { 5343 for (auto Inst : LoopInvariants) 5344 Invariant += GetRegUsage(Inst->getType(), VFs[i]); 5345 } 5346 5347 LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); 5348 LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); 5349 LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant 5350 << '\n'); 5351 5352 RU.LoopInvariantRegs = Invariant; 5353 RU.MaxLocalUsers = MaxUsages[i]; 5354 RUs[i] = RU; 5355 } 5356 5357 return RUs; 5358 } 5359 5360 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5361 // TODO: Cost model for emulated masked load/store is completely 5362 // broken. This hack guides the cost model to use an artificially 5363 // high enough value to practically disable vectorization with such 5364 // operations, except where previously deployed legality hack allowed 5365 // using very low cost values. This is to avoid regressions coming simply 5366 // from moving "masked load/store" check from legality to cost model. 
// Returns true for loads and for stores beyond the predicated-store budget.
// \p I must be an instruction that is scalarized with predication (asserted).
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

// Populates InstsToScalarize[VF] with instructions that are more profitable
// to scalarize (with predication) than to vectorize for the given \p VF, and
// records the blocks that will remain predicated after vectorization.
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

// Computes the cost discount obtained by scalarizing the single-use chain of
// instructions feeding \p PredInst rather than vectorizing it. A non-negative
// result means scalarization is at least as cheap; the per-instruction scalar
// costs are recorded in \p ScalarCosts.
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    unsigned VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                                 true, false);
      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF))
          ScalarCost += TTI.getScalarizationOverhead(
              ToVectorTy(J->getType(), VF), false, true);
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5481 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5482 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5483 true, false); 5484 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5485 } 5486 5487 // Compute the scalarization overhead of needed extractelement 5488 // instructions. For each of the instruction's operands, if the operand can 5489 // be scalarized, add it to the worklist; otherwise, account for the 5490 // overhead. 5491 for (Use &U : I->operands()) 5492 if (auto *J = dyn_cast<Instruction>(U.get())) { 5493 assert(VectorType::isValidElementType(J->getType()) && 5494 "Instruction has non-scalar type"); 5495 if (canBeScalarized(J)) 5496 Worklist.push_back(J); 5497 else if (needsExtract(J, VF)) 5498 ScalarCost += TTI.getScalarizationOverhead( 5499 ToVectorTy(J->getType(),VF), false, true); 5500 } 5501 5502 // Scale the total scalar cost by block probability. 5503 ScalarCost /= getReciprocalPredBlockProb(); 5504 5505 // Compute the discount. A non-negative discount means the vector version 5506 // of the instruction costs more, and scalarizing would be beneficial. 5507 Discount += VectorCost - ScalarCost; 5508 ScalarCosts[I] = ScalarCost; 5509 } 5510 5511 return Discount; 5512 } 5513 5514 LoopVectorizationCostModel::VectorizationCostTy 5515 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5516 VectorizationCostTy Cost; 5517 5518 // For each block. 5519 for (BasicBlock *BB : TheLoop->blocks()) { 5520 VectorizationCostTy BlockCost; 5521 5522 // For each instruction in the old loop. 5523 for (Instruction &I : BB->instructionsWithoutDebug()) { 5524 // Skip ignored values. 5525 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5526 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5527 continue; 5528 5529 VectorizationCostTy C = getInstructionCost(&I, VF); 5530 5531 // Check if we should override the cost. 
5532 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5533 C.first = ForceTargetInstructionCost; 5534 5535 BlockCost.first += C.first; 5536 BlockCost.second |= C.second; 5537 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5538 << " for VF " << VF << " For instruction: " << I 5539 << '\n'); 5540 } 5541 5542 // If we are vectorizing a predicated block, it will have been 5543 // if-converted. This means that the block's instructions (aside from 5544 // stores and instructions that may divide by zero) will now be 5545 // unconditionally executed. For the scalar case, we may not always execute 5546 // the predicated block. Thus, scale the block's cost by the probability of 5547 // executing it. 5548 if (VF == 1 && blockNeedsPredication(BB)) 5549 BlockCost.first /= getReciprocalPredBlockProb(); 5550 5551 Cost.first += BlockCost.first; 5552 Cost.second |= BlockCost.second; 5553 } 5554 5555 return Cost; 5556 } 5557 5558 /// Gets Address Access SCEV after verifying that the access pattern 5559 /// is loop invariant except the induction variable dependence. 5560 /// 5561 /// This SCEV can be sent to the Target in order to estimate the address 5562 /// calculation cost. 5563 static const SCEV *getAddressAccessSCEV( 5564 Value *Ptr, 5565 LoopVectorizationLegality *Legal, 5566 PredicatedScalarEvolution &PSE, 5567 const Loop *TheLoop) { 5568 5569 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5570 if (!Gep) 5571 return nullptr; 5572 5573 // We are looking for a gep with all loop invariant indices except for one 5574 // which should be an induction variable. 5575 auto SE = PSE.getSE(); 5576 unsigned NumOperands = Gep->getNumOperands(); 5577 for (unsigned i = 1; i < NumOperands; ++i) { 5578 Value *Opd = Gep->getOperand(i); 5579 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5580 !Legal->isInductionVariable(Opd)) 5581 return nullptr; 5582 } 5583 5584 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 
// Returns true if either operand of \p I is a symbolic stride recognized by
// legality analysis (such strides are replaced by 1 during versioning).
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}

// Cost of executing memory instruction \p I as VF scalar operations, including
// address computation, extract/insert overhead, and the predication discount
// (or the hack cost for emulated masked memrefs).
unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                                 unsigned VF) {
  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
  Type *ValTy = getMemInstValueType(I);
  auto SE = PSE.getSE();

  unsigned Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  Cost += VF *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated store, it may not be executed for each vector
  // lane. Scale the cost by the probability of executing the predicated
  // block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}

// Cost of widening \p I into a single (possibly masked) vector memory op with
// consecutive stride +1/-1; a reverse stride adds a shuffle per access.
unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                             unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned Alignment = getLoadStoreAlignment(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  unsigned Cost = 0;
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);

  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  return Cost;
}

// Cost of a memory op whose address is uniform across the loop: a scalar
// load + broadcast, or a scalar store (+ extract of the last lane when the
// stored value is not loop invariant).
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                         unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
         (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
                                              Instruction::ExtractElement,
                                              VectorTy, VF - 1));
}
// Cost of executing \p I as a (possibly masked) gather or scatter of VF lanes.
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                          unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned Alignment = getLoadStoreAlignment(I);
  Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                    Legal->isMaskRequired(I), Alignment);
}

// Cost of executing the whole interleave group that \p I belongs to as one
// wide memory operation plus the shuffles needed to (de)interleave.
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                            unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  // The wide vector covers VF iterations of all group members.
  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  unsigned Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  }
  return Cost;
}

// Cost of memory instruction \p I: the plain scalar cost at VF == 1,
// otherwise the cost recorded by the widening decision.
unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                              unsigned VF) {
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // moment.
  if (VF == 1) {
    Type *ValTy = getMemInstValueType(I);
    unsigned Alignment = getLoadStoreAlignment(I);
    unsigned AS = getLoadStoreAddressSpace(I);

    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
  }
  return getWideningCost(I, VF);
}

// Returns the cost of \p I at \p VF, plus a flag saying whether the result
// type will actually be produced as a single vector (not split into parts).
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = 1;

  if (VF > 1 && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.find(I) != InstSet.end())
      return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
  }

  Type *VectorTy;
  unsigned C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized =
      VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
  return VectorizationCostTy(C, TypeNotScalarized);
}
// Estimates the insert/extract overhead incurred when \p I is scalarized at
// \p VF: inserting its VF scalar results into a vector and extracting lanes
// of its vector operands, subject to target preferences.
unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                              unsigned VF) {

  if (VF == 1)
    return 0;

  unsigned Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(RetTy, true, false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), VF);
}

// For every memory instruction in the loop, decides (and records) how it will
// be vectorized at \p VF: widened, widened-reversed, interleaved,
// gather/scatter, or scalarized -- choosing the cheapest legal option. Also
// forces address computations scalar on targets that prefer scalar addressing.
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
  if (VF == 1)
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniform(Ptr) &&
          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(I, VF, CM_Scalarize,
                            (VF * getMemoryInstructionCost(I, 1)));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(Member, VF, CM_Scalarize,
                                (VF * getMemoryInstructionCost(Member, 1)));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
// Per-opcode cost estimation for \p I at \p VF. Also reports (via \p VectorTy)
// the vector type the instruction will be computed with, which may have been
// shrunk by the minimal-bitwidth analysis.
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF > 1 && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
             PredicatedBBsAfterVectorization.end() ||
         PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
             PredicatedBBsAfterVectorization.end()))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      Type *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
              (TTI.getCFInstrCost(Instruction::Br) * VF));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br);
    else
      // This branch will be eliminated by if-conversion.
      return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                VectorTy, VF - 1, VectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));

    return TTI.getCFInstrCost(Instruction::PHI);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF > 1 && isScalarWithPredication(I)) {
      unsigned Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

      // The cost of the non-predicated instruction.
      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
  }
  case Instruction::FNeg: {
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                   I->getOperand(0));
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    unsigned Width = VF;
    if (Width > 1) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = 1;
    }
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (I->getOpcode() == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (I->getOpcode() == Instruction::ZExt ||
                 I->getOpcode() == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    if (getVectorIntrinsicIDForCall(CI, TLI))
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
    return CallCost;
  }
  default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
           getScalarizationOverhead(I, VF);
  } // end of switch.
}

char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

Pass *createLoopVectorizePass() { return new LoopVectorize(); }

Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm
6210 } 6211 6212 } // end namespace llvm 6213 6214 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6215 // Check if the pointer operand of a load or store instruction is 6216 // consecutive. 6217 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6218 return Legal->isConsecutivePtr(Ptr); 6219 return false; 6220 } 6221 6222 void LoopVectorizationCostModel::collectValuesToIgnore() { 6223 // Ignore ephemeral values. 6224 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6225 6226 // Ignore type-promoting instructions we identified during reduction 6227 // detection. 6228 for (auto &Reduction : *Legal->getReductionVars()) { 6229 RecurrenceDescriptor &RedDes = Reduction.second; 6230 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6231 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6232 } 6233 // Ignore type-casting instructions we identified during induction 6234 // detection. 6235 for (auto &Induction : *Legal->getInductionVars()) { 6236 InductionDescriptor &IndDes = Induction.second; 6237 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6238 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6239 } 6240 } 6241 6242 // TODO: we could return a pair of values that specify the max VF and 6243 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6244 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6245 // doesn't have a cost model that can choose which plan to execute if 6246 // more than one is generated. 
/// Pick a VPlan-native-path VF as the number of widest-type elements that fit
/// in the widest vector register.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}

Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    CM.InterleaveInfo.reset();
  }

  if (UserVF) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    buildVPlansWithVPRecipes(UserVF, UserVF);
    LLVM_DEBUG(printPlans(dbgs()));
    // A user-provided VF is taken as-is; no cost-model selection.
    return {{UserVF, 0}};
  }

  unsigned MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF != 0 && "MaxVF is zero.");

  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF > 1)
      CM.collectInstsToScalarize(VF);
  }

  buildVPlansWithVPRecipes(1, MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (MaxVF == 1)
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  return CM.selectVectorizationFactor(MaxVF);
}

/// Record the chosen VF/UF and drop every VPlan that does not cover the
/// chosen VF; exactly one plan must remain.
void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
}

void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPCallbackILV CallbackILV(ILV);

  VPTransformState State{BestVF, BestUF,      LI,
                         DT,     ILV.Builder, ILV.VectorLoopValueMap,
                         &ILV,   CallbackILV};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop();
}

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  BasicBlock *Latch = OrigLoop->getLoopLatch();

  // We create new control-flow for the vectorized loop, so the original
  // condition will be dead after vectorization if it's only used by the
  // branch.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && Cmp->hasOneUse())
    DeadInstructions.insert(Cmp);

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  for (auto &Induction : *Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
                                 DeadInstructions.end();
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

// With VF == 1 there is nothing to reverse.
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

// With VF == 1 a "broadcast" is the scalar value itself.
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *Ty = Val->getType();
  assert(!Ty->isVectorTy() && "Val must be a scalar");

  if (Ty->isFloatingPointTy()) {
    Constant *C = ConstantFP::get(Ty, (double)StartIdx);

    // Floating point operations had to be 'fast' to enable the unrolling.
    Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
    return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
  }
  Constant *C = ConstantInt::get(Ty, StartIdx);
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}

/// Attach "llvm.loop.unroll.runtime.disable" to \p L's loop ID unless unroll
/// disabling metadata is already present.
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        // NOTE(review): this assignment overwrites the flag on every MDNode
        // operand, so a matching "llvm.loop.unroll.disable" entry followed by
        // another MDNode resets it to false — confirm whether |= was intended.
        IsUnrollMetadata =
            S && S->getString().startswith("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}

// Evaluate \p Predicate at Range.Start and clamp Range.End to the first
// power-of-two VF where the answer changes; returns the answer at Range.Start,
// which then holds for the entire (clamped) range.
bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);

  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}

/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlan(SubRange));
    // buildVPlan may have clamped SubRange.End; resume from there.
    VF = SubRange.End;
  }
}

// Compute (and cache) the mask controlling the Src->Dst CFG edge: the source
// block's mask, possibly AND'ed with the branch condition (or its negation).
// A nullptr result means "all-one" (unconditional).
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (!BI->isConditional())
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  // Taking the false successor means the condition must be negated.
  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask);

  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);

  return EdgeMaskCache[Edge] = EdgeMask;
}

// Compute (and cache) the mask under which \p BB executes: the OR of its
// incoming edge masks, or the early-exit mask for a predicated header.
// A nullptr result means "all-one".
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}

// Build a VPInterleaveRecipe for \p I if it is the insert position of an
// interleave group for all VF's in (a possibly clamped) \p Range; returns
// nullptr otherwise.
VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
                                                           VFRange &Range,
                                                           VPlanPtr &Plan) {
  const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
  if (!IG)
    return nullptr;

  // Now check if IG is relevant for VF's in the given range.
  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
    return [=](unsigned VF) -> bool {
      return (VF >= 2 && // Query is illegal for VF == 1
              CM.getWideningDecision(I, VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
  };
  if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
    return nullptr;

  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
  // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
  // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
  assert(I == IG->getInsertPos() &&
         "Generating a recipe for an adjunct member of an interleave group");

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPInterleaveRecipe(IG, Mask);
}

// Build a VPWidenMemoryInstructionRecipe for a load/store that the cost model
// decided to widen (or gather/scatter) for all VF's in (a possibly clamped)
// \p Range; returns nullptr otherwise.
VPWidenMemoryInstructionRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                  VPlanPtr &Plan) {
  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
    return nullptr;

  auto willWiden = [&](unsigned VF) -> bool {
    if (VF == 1)
      return false;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
           "Interleave memory opportunity should be caught earlier.");
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPWidenMemoryInstructionRecipe(*I, Mask);
}

// Build a VPWidenIntOrFpInductionRecipe for an int/fp induction phi, or for an
// optimizable truncation of one; returns nullptr otherwise.
VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
    // Check if this is an integer or fp induction. If so, build the recipe that
    // produces its scalar and vector values.
    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
    if (II.getKind() == InductionDescriptor::IK_IntInduction ||
        II.getKind() == InductionDescriptor::IK_FpInduction)
      return new VPWidenIntOrFpInductionRecipe(Phi);

    return nullptr;
  }

  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(unsigned)> {
    return
        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
  };

  if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
                               isOptimizableIVTruncate(I), Range))
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                             cast<TruncInst>(I));
  return nullptr;
}

// Build a VPBlendRecipe (select tree) for a non-header phi; returns nullptr
// if \p I is not such a phi.
VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
  PHINode *Phi = dyn_cast<PHINode>(I);
  if (!Phi || Phi->getParent() == OrigLoop->getHeader())
    return nullptr;

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  SmallVector<VPValue *, 2> Masks;
  unsigned NumIncoming = Phi->getNumIncomingValues();
  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    if (EdgeMask)
      Masks.push_back(EdgeMask);
  }
  return new VPBlendRecipe(Phi, Masks);
}

// Widen \p I via a VPWidenRecipe appended to \p VPBB if it is vectorizable,
// not predicated, and worth widening for the (possibly clamped) \p Range.
// Returns true on success.
bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                                 VFRange &Range) {

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  if (IsPredicated)
    return false;

  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::Br:
    case Instruction::Call:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::GetElementPtr:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::Load:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PHI:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Store:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return false;

  // Certain intrinsics are only bookkeeping and are never widened.
  if (CallInst *CI = dyn_cast<CallInst>(I)) {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
      return false;
  }

  auto willWiden = [&](unsigned VF) -> bool {
    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                             CM.isProfitableToScalarize(I, VF)))
      return false;
    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use Intrinsic or a usual Call for vectorized
      // version of the instruction.
      // Is it beneficial to perform intrinsic call compared to lib call?
      bool NeedToScalarize;
      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
      bool UseVectorIntrinsic =
          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
      return UseVectorIntrinsic || !NeedToScalarize;
    }
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      assert(CM.getWideningDecision(I, VF) ==
                 LoopVectorizationCostModel::CM_Scalarize &&
             "Memory widening decisions should have been taken care by now");
      return false;
    }
    return true;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return false;

  // Success: widen this instruction. We optimize the common case where
  // consecutive instructions can be represented by a single recipe.
  if (!VPBB->empty()) {
    VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
    if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
      return true;
  }

  VPBB->appendRecipe(new VPWidenRecipe(I));
  return true;
}

// Scalarize \p I via a VPReplicateRecipe; predicated replication introduces a
// replicate region and returns the new successor VPBasicBlock to continue in.
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (auto &Op : I->operands())
    if (auto *PredInst = dyn_cast<Instruction>(Op))
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
        PredInst2Recipe[PredInst]->setAlsoPack(false);

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  PredInst2Recipe[I] = Recipe;
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}

VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Void instructions produce no value, so no phi is needed to merge results.
  auto *PHIRecipe =
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}

// Try the specialized recipes in priority order; returns true iff some recipe
// was appended for \p Instr.
bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                        VPlanPtr &Plan, VPBasicBlock *VPBB) {
  VPRecipeBase *Recipe = nullptr;
  // Check if Instr should belong to an interleave memory recipe, or already
  // does. In the latter case Instr is irrelevant.
  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Check if Instr is a memory operation that should be widened.
  if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Check if Instr should form some PHI recipe.
  if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if ((Recipe = tryToBlend(Instr, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
    VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
    return true;
  }

  // Check if Instr is to be widened by a general VPWidenRecipe, after
  // having first checked for specific widening recipes that deal with
  // Interleave Groups, Inductions and Phi nodes.
  if (tryToWiden(Instr, VPBB, Range))
    return true;

  return false;
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                        unsigned MaxVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");

  // Collect conditions feeding internal conditional branches; they need to be
  // represented in VPlan for it to model masking.
  SmallPtrSet<Value *, 1> NeedDef;

  auto *Latch = OrigLoop->getLoopLatch();
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (BB == Latch)
      continue;
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
    if (Branch && Branch->isConditional())
      NeedDef.insert(Branch->getCondition());
  }

  // If the tail is to be folded by masking, the primary induction variable
  // needs to be represented in VPlan for it to model early-exit masking.
  if (CM.foldTailByMasking())
    NeedDef.insert(Legal->getPrimaryInduction());

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  DenseMap<Instruction *, Instruction *> SinkAfterInverse;

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  auto Plan = llvm::make_unique<VPlan>(VPBB);

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    std::vector<Instruction *> Ingredients;

    // Organize the ingredients to vectorize from current basic block in the
    // right order.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) ||
          DeadInstructions.find(Instr) != DeadInstructions.end())
        continue;

      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
      // member of the IG, do not construct any Recipe for it.
      const InterleaveGroup<Instruction> *IG =
          CM.getInterleavedAccessGroup(Instr);
      if (IG && Instr != IG->getInsertPos() &&
          Range.Start >= 2 && // Query is illegal for VF == 1
          CM.getWideningDecision(Instr, Range.Start) ==
              LoopVectorizationCostModel::CM_Interleave) {
        // Even though the adjunct member itself gets no recipe, an instruction
        // scheduled to be sunk after it must still be emitted here.
        auto SinkCandidate = SinkAfterInverse.find(Instr);
        if (SinkCandidate != SinkAfterInverse.end())
          Ingredients.push_back(SinkCandidate->second);
        continue;
      }

      // Move instructions to handle first-order recurrences, step 1: avoid
      // handling this instruction until after we've handled the instruction it
      // should follow.
      auto SAIt = SinkAfter.find(Instr);
      if (SAIt != SinkAfter.end()) {
        LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
                          << *SAIt->second
                          << " to vectorize a 1st order recurrence.\n");
        SinkAfterInverse[SAIt->second] = Instr;
        continue;
      }

      Ingredients.push_back(Instr);

      // Move instructions to handle first-order recurrences, step 2: push the
      // instruction to be sunk at its insertion point.
      auto SAInvIt = SinkAfterInverse.find(Instr);
      if (SAInvIt != SinkAfterInverse.end())
        Ingredients.push_back(SAInvIt->second);
    }

    // Introduce each ingredient into VPlan.
    for (Instruction *Instr : Ingredients) {
      if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
        continue;

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        // Derive a name for the split block from the original block's name.
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // Register all VFs (powers of two within Range) with the plan and give the
  // plan a descriptive name, e.g. "Initial VPlan for VF={4,8},UF>=1".
  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  unsigned VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; VF < Range.End; VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

// Build a VPlan for an outer loop on the VPlan-native path, directly from the
// IR's hierarchical CFG (no per-instruction recipes unless transformed below).
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->empty());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = llvm::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  // Convert the plain VPInstructions into executable recipes.
  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
      Plan, Legal->getInductionVars(), DeadInstructions);

  return Plan;
}

// Callback used during VPlan execution: forwards to the InnerLoopVectorizer
// to obtain (or create) the vector value of V for unroll part Part.
Value* LoopVectorizationPlanner::VPCallbackILV::
getOrCreateVectorValues(Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}

// Print this interleave-group recipe in VPlan's dot-escaped textual format:
// the group's factor and insert position, the mask operand if present, and
// each member of the group with its index.
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
  O << " +\n"
    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  if (User) {
    // Masked group: print the single (mask) operand as well.
    O << ", ";
    User->getOperand(0)->printAsOperand(O);
  }
  O << "\\l\"";
  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *I = IG->getMember(i))
      O << " +\n"
        << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
}

// Widen every ingredient in this recipe's [Begin, End) range.
void VPWidenRecipe::execute(VPTransformState &State) {
  for (auto &Instr : make_range(Begin, End))
    State.ILV->widenInstruction(Instr);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, Trunc);
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
}

void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = Phi->getNumIncomingValues();

  // A User carrying the edge masks must exist unless the blend has a single
  // incoming value, which needs no mask.
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with predecessors having a full mask");
  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               ( ...)))
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 =
          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  // Publish the final select chain as the PHI's vector value for each part.
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  // No User means the group is unmasked: vectorize it without mask values.
  if (!User)
    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    // If the mask is a vector, extract this instance's lane bit.
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  // Successors are filled in as the corresponding blocks get created; the
  // placeholder self-successor is immediately reset to null.
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    // The value is undef along the edge where the predicated block was
    // skipped.
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  // No User means the access is unmasked.
  if (!User)
    return State.ILV->vectorizeMemoryInstruction(&Instr);

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}

// Decide how the scalar epilogue of loop L is to be handled, based on the
// vectorization pragmas (Hints) and on whether the enclosing function is
// optimized for size (function attributes or profile information).
static ScalarEpilogueLowering
getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
  // An explicit "vectorize enable" pragma overrides the size-based
  // restriction.
  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
      (F->hasOptSize() ||
       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
    SEL = CM_ScalarEpilogueNotAllowedOptSize;
  else if (Hints.getPredicate())
    SEL = CM_ScalarEpilogueNotNeededPredicatePragma;

  return SEL;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  // Unroll factor is fixed to 1 on this path.
  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

// Analyze a single loop and, when legal and deemed profitable, vectorize
// and/or interleave it. Returns true if the IR was changed.
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");
  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  // Prefer constant trip counts over profile data, over upper bound estimate.
  unsigned ExpectedTC = 0;
  bool HasExpectedTC = false;
  if (const SCEVConstant *ConstExits =
          dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
    const APInt &ExitsCount = ConstExits->getAPInt();
    // We are interested in small values for ExpectedTC. Skip over those that
    // can't fit an unsigned.
    if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
      // Trip count is backedge-taken count plus one.
      ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
      HasExpectedTC = true;
    }
  }
  // ExpectedTC may be large because it's bound by a variable. Check
  // profiling information to validate we should vectorize.
  if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
    auto EstimatedTC = getLoopEstimatedTripCount(L);
    if (EstimatedTC) {
      ExpectedTC = *EstimatedTC;
      HasExpectedTC = true;
    }
  }
  // Last resort: SCEV's small constant maximum trip count (0 when unknown).
  if (!HasExpectedTC) {
    ExpectedTC = SE->getSmallConstantMaxTripCount(L);
    HasExpectedTC = (ExpectedTC > 0);
  }

  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);

  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
  unsigned UserIC = Hints.getInterleave();

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  // Capture the original loop metadata before the transform rewrites it.
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  // Propagate any user-specified follow-up metadata onto the remainder loop;
  // otherwise attach the standard post-vectorization metadata.
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

// Shared driver for the pass entry points: caches the analysis pointers and
// runs vectorization over all supported loops in the function. Returns true
// if anything changed (including loop simplification/LCSSA formation).
bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything end up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}

// New pass manager entry point: gather the analyses runImpl needs and report
// which analyses remain preserved after the transformation.
PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  // LoopAccessInfo is a loop analysis: fetch it lazily through the inner loop
  // analysis manager, supplying the standard analysis results it expects.
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  // ProfileSummaryInfo is a module analysis; from a function pass only a
  // cached result may be queried (may be null).
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
}