//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
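//
// As an illustrative (not exhaustive) sketch of the transformation, a simple
// scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each 'wide' iteration processes VF
// elements at once (here VF = 4), with a scalar epilogue loop handling the
// remaining n % 4 iterations:
//
//   for (int i = 0; i < n - n % 4; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];  // one <4 x i32> add per iteration
//   for (int i = n - n % 4; i < n; ++i)
//     a[i] = b[i] + c[i];              // scalar remainder (epilogue) loop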
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanHCFGTransforms.h"
#include "VPlanPredicator.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}
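
// For instance (illustrative only): ToVectorTy(i32, 4) yields <4 x i32>,
// while ToVectorTy(i32, 1) and ToVectorTy(void, 4) return the scalar i32 and
// void types unchanged.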

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
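
// An illustrative example of an irregular type: with VF = 4, i1 occupies one
// byte per element as an array (4 * getTypeAllocSize(i1) = 4 bytes), but a
// <4 x i1> vector stores into a single byte (getTypeStoreSize = 1 byte), so
// an array of i1 is not "bitcast compatible" with the corresponding vector
// and hasIrregularType returns true. A type such as i32 is regular: 4 * 4
// bytes equals the 16-byte store size of <4 x i32>.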

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between
  /// \p MinLane and \p MaxLane, times each part between \p MinPart and
  /// \p MaxPart, inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
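
  // As an illustrative sketch of the on-demand packing described above: if a
  // definition %d was scalarized with VF = 4 and a vector use is encountered,
  // getOrCreateVectorValue conceptually emits
  //   %v0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
  //   %v1 = insertelement <4 x i32> %v0,   i32 %d.1, i32 1
  //   %v2 = insertelement <4 x i32> %v1,   i32 %d.2, i32 2
  //   %v3 = insertelement <4 x i32> %v2,   i32 %d.3, i32 3
  // and caches %v3 as the vector value for that part (the value names here
  // are purely illustrative).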

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);
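
  // As an illustrative sketch: with VF = 4, StartIdx = 0 and Step = 1,
  // getStepVector applied to a broadcast of an induction start value %start
  // conceptually yields
  //   <%start + 0, %start + 1, %start + 2, %start + 3>
  // i.e. one lane per consecutive scalar iteration of the original loop.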

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm
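
// A hypothetical call site, to illustrate how the debug message, remark text
// and remark tag fit together (the strings and tag below are made up for
// illustration, not taken from an actual caller):
//
//   reportVectorizationFailure(
//       "Cannot prove the loop is free of memory dependences",
//       "cannot prove loop is free of memory dependences",
//       "UnsafeMemoryDependence", ORE, TheLoop, /*I=*/nullptr);
//
// This prints "LV: Not vectorizing: ..." under -debug and emits an
// OptimizationRemarkAnalysis prefixed with "loop not vectorized: ".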

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededPredicatePragma
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;

    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }
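
  // As an illustrative sketch: given an i64 induction %iv and a use
  //   %t = trunc i64 %iv to i32
  // that is not free on the target, %t is an optimizable IV truncate; instead
  // of widening the trunc, the vectorizer can introduce a new i32 induction
  // variable that produces the truncated values directly (the value names
  // here are illustrative only).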

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }
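
  // As an illustrative example of the distinction: in a block guarded by
  //   if (c[i] != 0) a[i] = b[i] / c[i];
  // the division may trap on a zero divisor, so it is scalarized with
  // predication (isScalarWithPredication), whereas the conditional load and
  // store can instead become masked vector operations when the target
  // supports them; all of these count as predicated instructions
  // (isPredicatedInst).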

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available, or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be
  /// truncated to this type.
  MapVector<Instruction *, uint64_t> MinBWs;
The entries are Instruction-Cost 1337 /// pairs. 1338 using ScalarCostsTy = DenseMap<Instruction *, unsigned>; 1339 1340 /// A set containing all BasicBlocks that are known to present after 1341 /// vectorization as a predicated block. 1342 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; 1343 1344 /// Records whether it is allowed to have the original scalar loop execute at 1345 /// least once. This may be needed as a fallback loop in case runtime 1346 /// aliasing/dependence checks fail, or to handle the tail/remainder 1347 /// iterations when the trip count is unknown or doesn't divide by the VF, 1348 /// or as a peel-loop to handle gaps in interleave-groups. 1349 /// Under optsize and when the trip count is very small we don't allow any 1350 /// iterations to execute in the scalar loop. 1351 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; 1352 1353 /// All blocks of loop are to be masked to fold tail of scalar iterations. 1354 bool FoldTailByMasking = false; 1355 1356 /// A map holding scalar costs for different vectorization factors. The 1357 /// presence of a cost for an instruction in the mapping indicates that the 1358 /// instruction will be scalarized when vectorizing with the associated 1359 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1360 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize; 1361 1362 /// Holds the instructions known to be uniform after vectorization. 1363 /// The data is collected per VF. 1364 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms; 1365 1366 /// Holds the instructions known to be scalar after vectorization. 1367 /// The data is collected per VF. 1368 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars; 1369 1370 /// Holds the instructions (address computations) that are forced to be 1371 /// scalarized. 1372 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1373 1374 /// Returns the expected difference in cost from scalarizing the expression 1375 /// feeding a predicated instruction \p PredInst. The instructions to 1376 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1377 /// non-negative return value implies the expression will be scalarized. 1378 /// Currently, only single-use chains are considered for scalarization. 1379 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1380 unsigned VF); 1381 1382 /// Collect the instructions that are uniform after vectorization. An 1383 /// instruction is uniform if we represent it with a single scalar value in 1384 /// the vectorized loop corresponding to each vector iteration. Examples of 1385 /// uniform instructions include pointer operands of consecutive or 1386 /// interleaved memory accesses. Note that although uniformity implies an 1387 /// instruction will be scalar, the reverse is not true. In general, a 1388 /// scalarized instruction will be represented by VF scalar values in the 1389 /// vectorized loop, each corresponding to an iteration of the original 1390 /// scalar loop. 1391 void collectLoopUniforms(unsigned VF); 1392 1393 /// Collect the instructions that are scalar after vectorization. An 1394 /// instruction is scalar if it is known to be uniform or will be scalarized 1395 /// during vectorization. Non-uniform scalarized instructions will be 1396 /// represented by VF values in the vectorized loop, each corresponding to an 1397 /// iteration of the original scalar loop. 
1398 void collectLoopScalars(unsigned VF); 1399 1400 /// Keeps cost model vectorization decision and cost for instructions. 1401 /// Right now it is used for memory instructions only. 1402 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1403 std::pair<InstWidening, unsigned>>; 1404 1405 DecisionList WideningDecisions; 1406 1407 /// Returns true if \p V is expected to be vectorized and it needs to be 1408 /// extracted. 1409 bool needsExtract(Value *V, unsigned VF) const { 1410 Instruction *I = dyn_cast<Instruction>(V); 1411 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1412 return false; 1413 1414 // Assume we can vectorize V (and hence we need extraction) if the 1415 // scalars are not computed yet. This can happen, because it is called 1416 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1417 // the scalars are collected. That should be a safe assumption in most 1418 // cases, because we check if the operands have vectorizable types 1419 // beforehand in LoopVectorizationLegality. 1420 return Scalars.find(VF) == Scalars.end() || 1421 !isScalarAfterVectorization(I, VF); 1422 }; 1423 1424 /// Returns a range containing only operands needing to be extracted. 1425 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1426 unsigned VF) { 1427 return SmallVector<Value *, 4>(make_filter_range( 1428 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1429 } 1430 1431 public: 1432 /// The loop that we evaluate. 1433 Loop *TheLoop; 1434 1435 /// Predicated scalar evolution analysis. 1436 PredicatedScalarEvolution &PSE; 1437 1438 /// Loop Info analysis. 1439 LoopInfo *LI; 1440 1441 /// Vectorization legality. 1442 LoopVectorizationLegality *Legal; 1443 1444 /// Vector target information. 1445 const TargetTransformInfo &TTI; 1446 1447 /// Target Library Info. 1448 const TargetLibraryInfo *TLI; 1449 1450 /// Demanded bits analysis. 1451 DemandedBits *DB; 1452 1453 /// Assumption cache. 1454 AssumptionCache *AC; 1455 1456 /// Interface to emit optimization remarks. 1457 OptimizationRemarkEmitter *ORE; 1458 1459 const Function *TheFunction; 1460 1461 /// Loop Vectorize Hint. 1462 const LoopVectorizeHints *Hints; 1463 1464 /// The interleave access information contains groups of interleaved accesses 1465 /// with the same stride and close to each other. 1466 InterleavedAccessInfo &InterleaveInfo; 1467 1468 /// Values to ignore in the cost model. 1469 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1470 1471 /// Values to ignore in the cost model when VF > 1. 1472 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1473 }; 1474 1475 } // end namespace llvm 1476 1477 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1478 // vectorization. The loop needs to be annotated with #pragma omp simd 1479 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1480 // vector length information is not provided, vectorization is not considered 1481 // explicit. Interleave hints are not allowed either. These limitations will be 1482 // relaxed in the future. 1483 // Please, note that we are currently forced to abuse the pragma 'clang 1484 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1485 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1486 // provides *explicit vectorization hints* (LV can bypass legal checks and 1487 // assume that vectorization is legal). 
However, both hints are implemented 1488 // using the same metadata (llvm.loop.vectorize, processed by 1489 // LoopVectorizeHints). This will be fixed in the future when the native IR 1490 // representation for pragma 'omp simd' is introduced. 1491 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1492 OptimizationRemarkEmitter *ORE) { 1493 assert(!OuterLp->empty() && "This is not an outer loop"); 1494 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1495 1496 // Only outer loops with an explicit vectorization hint are supported. 1497 // Unannotated outer loops are ignored. 1498 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1499 return false; 1500 1501 Function *Fn = OuterLp->getHeader()->getParent(); 1502 if (!Hints.allowVectorization(Fn, OuterLp, 1503 true /*VectorizeOnlyWhenForced*/)) { 1504 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1505 return false; 1506 } 1507 1508 if (Hints.getInterleave() > 1) { 1509 // TODO: Interleave support is future work. 1510 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1511 "outer loops.\n"); 1512 Hints.emitRemarkWithHints(); 1513 return false; 1514 } 1515 1516 return true; 1517 } 1518 1519 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1520 OptimizationRemarkEmitter *ORE, 1521 SmallVectorImpl<Loop *> &V) { 1522 // Collect inner loops and outer loops without irreducible control flow. For 1523 // now, only collect outer loops that have explicit vectorization hints. If we 1524 // are stress testing the VPlan H-CFG construction, we collect the outermost 1525 // loop of every loop nest. 1526 if (L.empty() || VPlanBuildStressTest || 1527 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1528 LoopBlocksRPO RPOT(&L); 1529 RPOT.perform(LI); 1530 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1531 V.push_back(&L); 1532 // TODO: Collect inner loops inside marked outer loops in case 1533 // vectorization fails for the outer loop. Do not invoke 1534 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1535 // already known to be reducible. We can use an inherited attribute for 1536 // that. 1537 return; 1538 } 1539 } 1540 for (Loop *InnerL : L) 1541 collectSupportedLoops(*InnerL, LI, ORE, V); 1542 } 1543 1544 namespace { 1545 1546 /// The LoopVectorize Pass. 1547 struct LoopVectorize : public FunctionPass { 1548 /// Pass identification, replacement for typeid 1549 static char ID; 1550 1551 LoopVectorizePass Impl; 1552 1553 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1554 bool VectorizeOnlyWhenForced = false) 1555 : FunctionPass(ID) { 1556 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1557 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1558 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1559 } 1560 1561 bool runOnFunction(Function &F) override { 1562 if (skipFunction(F)) 1563 return false; 1564 1565 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1566 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1567 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1568 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1569 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1570 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1571 auto *TLI = TLIP ? 
&TLIP->getTLI() : nullptr; 1572 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1573 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1574 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1575 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1576 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1577 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1578 1579 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1580 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1581 1582 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1583 GetLAA, *ORE, PSI); 1584 } 1585 1586 void getAnalysisUsage(AnalysisUsage &AU) const override { 1587 AU.addRequired<AssumptionCacheTracker>(); 1588 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1589 AU.addRequired<DominatorTreeWrapperPass>(); 1590 AU.addRequired<LoopInfoWrapperPass>(); 1591 AU.addRequired<ScalarEvolutionWrapperPass>(); 1592 AU.addRequired<TargetTransformInfoWrapperPass>(); 1593 AU.addRequired<AAResultsWrapperPass>(); 1594 AU.addRequired<LoopAccessLegacyAnalysis>(); 1595 AU.addRequired<DemandedBitsWrapperPass>(); 1596 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1597 1598 // We currently do not preserve loopinfo/dominator analyses with outer loop 1599 // vectorization. Until this is addressed, mark these analyses as preserved 1600 // only for non-VPlan-native path. 1601 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 1602 if (!EnableVPlanNativePath) { 1603 AU.addPreserved<LoopInfoWrapperPass>(); 1604 AU.addPreserved<DominatorTreeWrapperPass>(); 1605 } 1606 1607 AU.addPreserved<BasicAAWrapperPass>(); 1608 AU.addPreserved<GlobalsAAWrapperPass>(); 1609 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1610 } 1611 }; 1612 1613 } // end anonymous namespace 1614 1615 //===----------------------------------------------------------------------===// 1616 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1617 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1618 //===----------------------------------------------------------------------===// 1619 1620 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1621 // We need to place the broadcast of invariant variables outside the loop, 1622 // but only if it's proven safe to do so. Else, broadcast will be inside 1623 // vector loop body. 1624 Instruction *Instr = dyn_cast<Instruction>(V); 1625 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1626 (!Instr || 1627 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1628 // Place the code for broadcasting invariant variables in the new preheader. 1629 IRBuilder<>::InsertPointGuard Guard(Builder); 1630 if (SafeToHoist) 1631 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1632 1633 // Broadcast the scalar into all locations in the vector. 
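  // Illustrative sketch only (not emitted verbatim): assuming VF = 4 and an
  // i32 invariant %x, the splat created below typically lowers to the usual
  // insertelement + shufflevector pair:
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                          <4 x i32> undef, <4 x i32> zeroinitializer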
1634 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1635 1636 return Shuf; 1637 } 1638 1639 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1640 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1641 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1642 "Expected either an induction phi-node or a truncate of it!"); 1643 Value *Start = II.getStartValue(); 1644 1645 // Construct the initial value of the vector IV in the vector loop preheader 1646 auto CurrIP = Builder.saveIP(); 1647 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1648 if (isa<TruncInst>(EntryVal)) { 1649 assert(Start->getType()->isIntegerTy() && 1650 "Truncation requires an integer type"); 1651 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1652 Step = Builder.CreateTrunc(Step, TruncType); 1653 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1654 } 1655 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1656 Value *SteppedStart = 1657 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1658 1659 // We create vector phi nodes for both integer and floating-point induction 1660 // variables. Here, we determine the kind of arithmetic we will perform. 1661 Instruction::BinaryOps AddOp; 1662 Instruction::BinaryOps MulOp; 1663 if (Step->getType()->isIntegerTy()) { 1664 AddOp = Instruction::Add; 1665 MulOp = Instruction::Mul; 1666 } else { 1667 AddOp = II.getInductionOpcode(); 1668 MulOp = Instruction::FMul; 1669 } 1670 1671 // Multiply the vectorization factor by the step using integer or 1672 // floating-point arithmetic as appropriate. 1673 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1674 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1675 1676 // Create a vector splat to use in the induction update. 1677 // 1678 // FIXME: If the step is non-constant, we create the vector splat with 1679 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1680 // handle a constant vector splat. 1681 Value *SplatVF = isa<Constant>(Mul) 1682 ? ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1683 : Builder.CreateVectorSplat(VF, Mul); 1684 Builder.restoreIP(CurrIP); 1685 1686 // We may need to add the step a number of times, depending on the unroll 1687 // factor. The last of those goes into the PHI. 1688 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1689 &*LoopVectorBody->getFirstInsertionPt()); 1690 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1691 Instruction *LastInduction = VecInd; 1692 for (unsigned Part = 0; Part < UF; ++Part) { 1693 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1694 1695 if (isa<TruncInst>(EntryVal)) 1696 addMetadata(LastInduction, EntryVal); 1697 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1698 1699 LastInduction = cast<Instruction>(addFastMathFlag( 1700 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1701 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1702 } 1703 1704 // Move the last step to the end of the latch block. This ensures consistent 1705 // placement of all induction updates. 
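  // Illustrative sketch only: with VF = 4, UF = 1 and an i32 IV starting at 0
  // and stepping by 1, the vector IV built above resembles
  //   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
  //                                 [ %vec.ind.next, %vector.body ]
  //   %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  // where %vec.ind.next is the renamed last step.add moved into the latch.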
1706 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1707 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1708 auto *ICmp = cast<Instruction>(Br->getCondition()); 1709 LastInduction->moveBefore(ICmp); 1710 LastInduction->setName("vec.ind.next"); 1711 1712 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1713 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1714 } 1715 1716 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1717 return Cost->isScalarAfterVectorization(I, VF) || 1718 Cost->isProfitableToScalarize(I, VF); 1719 } 1720 1721 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1722 if (shouldScalarizeInstruction(IV)) 1723 return true; 1724 auto isScalarInst = [&](User *U) -> bool { 1725 auto *I = cast<Instruction>(U); 1726 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1727 }; 1728 return llvm::any_of(IV->users(), isScalarInst); 1729 } 1730 1731 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1732 const InductionDescriptor &ID, const Instruction *EntryVal, 1733 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1734 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1735 "Expected either an induction phi-node or a truncate of it!"); 1736 1737 // This induction variable is not the phi from the original loop but the 1738 // newly-created IV based on the proof that casted Phi is equal to the 1739 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1740 // re-uses the same InductionDescriptor that original IV uses but we don't 1741 // have to do any recording in this case - that is done when original IV is 1742 // processed. 1743 if (isa<TruncInst>(EntryVal)) 1744 return; 1745 1746 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1747 if (Casts.empty()) 1748 return; 1749 // Only the first Cast instruction in the Casts vector is of interest. 1750 // The rest of the Casts (if exist) have no uses outside the 1751 // induction update chain itself. 1752 Instruction *CastInst = *Casts.begin(); 1753 if (Lane < UINT_MAX) 1754 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1755 else 1756 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1757 } 1758 1759 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1760 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1761 "Primary induction variable must have an integer type"); 1762 1763 auto II = Legal->getInductionVars()->find(IV); 1764 assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); 1765 1766 auto ID = II->second; 1767 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1768 1769 // The scalar value to broadcast. This will be derived from the canonical 1770 // induction variable. 1771 Value *ScalarIV = nullptr; 1772 1773 // The value from the original loop to which we are mapping the new induction 1774 // variable. 1775 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1776 1777 // True if we have vectorized the induction variable. 1778 auto VectorizedIV = false; 1779 1780 // Determine if we want a scalar version of the induction variable. This is 1781 // true if the induction variable itself is not widened, or if it has at 1782 // least one user in the loop that is not widened. 1783 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); 1784 1785 // Generate code for the induction step. 
Note that induction steps are 1786 // required to be loop-invariant 1787 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && 1788 "Induction step should be loop invariant"); 1789 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1790 Value *Step = nullptr; 1791 if (PSE.getSE()->isSCEVable(IV->getType())) { 1792 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1793 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), 1794 LoopVectorPreHeader->getTerminator()); 1795 } else { 1796 Step = cast<SCEVUnknown>(ID.getStep())->getValue(); 1797 } 1798 1799 // Try to create a new independent vector induction variable. If we can't 1800 // create the phi node, we will splat the scalar induction variable in each 1801 // loop iteration. 1802 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { 1803 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1804 VectorizedIV = true; 1805 } 1806 1807 // If we haven't yet vectorized the induction variable, or if we will create 1808 // a scalar one, we need to define the scalar induction variable and step 1809 // values. If we were given a truncation type, truncate the canonical 1810 // induction variable and step. Otherwise, derive these values from the 1811 // induction descriptor. 1812 if (!VectorizedIV || NeedsScalarIV) { 1813 ScalarIV = Induction; 1814 if (IV != OldInduction) { 1815 ScalarIV = IV->getType()->isIntegerTy() 1816 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1817 : Builder.CreateCast(Instruction::SIToFP, Induction, 1818 IV->getType()); 1819 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1820 ScalarIV->setName("offset.idx"); 1821 } 1822 if (Trunc) { 1823 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1824 assert(Step->getType()->isIntegerTy() && 1825 "Truncation requires an integer step"); 1826 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1827 Step = Builder.CreateTrunc(Step, TruncType); 1828 } 1829 } 1830 1831 // If we haven't yet vectorized the induction variable, splat the scalar 1832 // induction variable, and build the necessary step vectors. 1833 // TODO: Don't do it unless the vectorized IV is really required. 1834 if (!VectorizedIV) { 1835 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1836 for (unsigned Part = 0; Part < UF; ++Part) { 1837 Value *EntryPart = 1838 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1839 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1840 if (Trunc) 1841 addMetadata(EntryPart, Trunc); 1842 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1843 } 1844 } 1845 1846 // If an induction variable is only used for counting loop iterations or 1847 // calculating addresses, it doesn't need to be widened. Create scalar steps 1848 // that can be used by instructions we will later scalarize. Note that the 1849 // addition of the scalar steps will not increase the number of instructions 1850 // in the loop in the common case prior to InstCombine. We will be trading 1851 // one vector extract for each scalar step. 1852 if (NeedsScalarIV) 1853 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1854 } 1855 1856 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1857 Instruction::BinaryOps BinOp) { 1858 // Create and check the types. 
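  // Illustrative note: lane i of the result is Val[i] + (StartIdx + i) * Step.
  // E.g. with VF = 4, StartIdx = 4 and Step = 2, the offsets added to Val are
  // <8, 10, 12, 14>.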
1859 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1860 int VLen = Val->getType()->getVectorNumElements(); 1861 1862 Type *STy = Val->getType()->getScalarType(); 1863 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1864 "Induction Step must be an integer or FP"); 1865 assert(Step->getType() == STy && "Step has wrong type"); 1866 1867 SmallVector<Constant *, 8> Indices; 1868 1869 if (STy->isIntegerTy()) { 1870 // Create a vector of consecutive numbers from zero to VF. 1871 for (int i = 0; i < VLen; ++i) 1872 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1873 1874 // Add the consecutive indices to the vector value. 1875 Constant *Cv = ConstantVector::get(Indices); 1876 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1877 Step = Builder.CreateVectorSplat(VLen, Step); 1878 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1879 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1880 // which can be found from the original scalar operations. 1881 Step = Builder.CreateMul(Cv, Step); 1882 return Builder.CreateAdd(Val, Step, "induction"); 1883 } 1884 1885 // Floating point induction. 1886 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1887 "Binary Opcode should be specified for FP induction"); 1888 // Create a vector of consecutive numbers from zero to VF. 1889 for (int i = 0; i < VLen; ++i) 1890 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1891 1892 // Add the consecutive indices to the vector value. 1893 Constant *Cv = ConstantVector::get(Indices); 1894 1895 Step = Builder.CreateVectorSplat(VLen, Step); 1896 1897 // Floating point operations had to be 'fast' to enable the induction. 1898 FastMathFlags Flags; 1899 Flags.setFast(); 1900 1901 Value *MulOp = Builder.CreateFMul(Cv, Step); 1902 if (isa<Instruction>(MulOp)) 1903 // Have to check, MulOp may be a constant 1904 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1905 1906 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1907 if (isa<Instruction>(BOp)) 1908 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1909 return BOp; 1910 } 1911 1912 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1913 Instruction *EntryVal, 1914 const InductionDescriptor &ID) { 1915 // We shouldn't have to build scalar steps if we aren't vectorizing. 1916 assert(VF > 1 && "VF should be greater than one"); 1917 1918 // Get the value type and ensure it and the step have the same integer type. 1919 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1920 assert(ScalarIVTy == Step->getType() && 1921 "Val and Step should have the same type"); 1922 1923 // We build scalar steps for both integer and floating-point induction 1924 // variables. Here, we determine the kind of arithmetic we will perform. 1925 Instruction::BinaryOps AddOp; 1926 Instruction::BinaryOps MulOp; 1927 if (ScalarIVTy->isIntegerTy()) { 1928 AddOp = Instruction::Add; 1929 MulOp = Instruction::Mul; 1930 } else { 1931 AddOp = ID.getInductionOpcode(); 1932 MulOp = Instruction::FMul; 1933 } 1934 1935 // Determine the number of scalars we need to generate for each unroll 1936 // iteration. If EntryVal is uniform, we only need to generate the first 1937 // lane. Otherwise, we generate all VF values. 1938 unsigned Lanes = 1939 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1940 : VF; 1941 // Compute the scalar steps and save the results in VectorLoopValueMap. 
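  // Illustrative note: each scalar step below is ScalarIV + (VF * Part + Lane)
  // * Step. E.g. with VF = 4 and UF = 2 (and EntryVal not uniform), the
  // generated values cover ScalarIV + k * Step for k = 0 .. 7.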
1942 for (unsigned Part = 0; Part < UF; ++Part) { 1943 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1944 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1945 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1946 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1947 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1948 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1949 } 1950 } 1951 } 1952 1953 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 1954 assert(V != Induction && "The new induction variable should not be used."); 1955 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 1956 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 1957 1958 // If we have a stride that is replaced by one, do it here. Defer this for 1959 // the VPlan-native path until we start running Legal checks in that path. 1960 if (!EnableVPlanNativePath && Legal->hasStride(V)) 1961 V = ConstantInt::get(V->getType(), 1); 1962 1963 // If we have a vector mapped to this value, return it. 1964 if (VectorLoopValueMap.hasVectorValue(V, Part)) 1965 return VectorLoopValueMap.getVectorValue(V, Part); 1966 1967 // If the value has not been vectorized, check if it has been scalarized 1968 // instead. If it has been scalarized, and we actually need the value in 1969 // vector form, we will construct the vector values on demand. 1970 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 1971 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 1972 1973 // If we've scalarized a value, that value should be an instruction. 1974 auto *I = cast<Instruction>(V); 1975 1976 // If we aren't vectorizing, we can just copy the scalar map values over to 1977 // the vector map. 1978 if (VF == 1) { 1979 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 1980 return ScalarValue; 1981 } 1982 1983 // Get the last scalar instruction we generated for V and Part. If the value 1984 // is known to be uniform after vectorization, this corresponds to lane zero 1985 // of the Part unroll iteration. Otherwise, the last instruction is the one 1986 // we created for the last vector lane of the Part unroll iteration. 1987 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 1988 auto *LastInst = cast<Instruction>( 1989 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 1990 1991 // Set the insert point after the last scalarized instruction. This ensures 1992 // the insertelement sequence will directly follow the scalar definitions. 1993 auto OldIP = Builder.saveIP(); 1994 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 1995 Builder.SetInsertPoint(&*NewIP); 1996 1997 // However, if we are vectorizing, we need to construct the vector values. 1998 // If the value is known to be uniform after vectorization, we can just 1999 // broadcast the scalar value corresponding to lane zero for each unroll 2000 // iteration. Otherwise, we construct the vector values using insertelement 2001 // instructions. Since the resulting vectors are stored in 2002 // VectorLoopValueMap, we will only generate the insertelements once. 2003 Value *VectorValue = nullptr; 2004 if (Cost->isUniformAfterVectorization(I, VF)) { 2005 VectorValue = getBroadcastInstrs(ScalarValue); 2006 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2007 } else { 2008 // Initialize packing with insertelements to start from undef. 
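  // Illustrative sketch only (value names are hypothetical): for VF = 4 with
  // scalar lanes %s0..%s3, the packing below builds
  //   %p0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  //   %p1 = insertelement <4 x i32> %p0, i32 %s1, i32 1
  //   %p2 = insertelement <4 x i32> %p1, i32 %s2, i32 2
  //   %p3 = insertelement <4 x i32> %p2, i32 %s3, i32 3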
2009 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2010 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2011 for (unsigned Lane = 0; Lane < VF; ++Lane) 2012 packScalarIntoVectorValue(V, {Part, Lane}); 2013 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2014 } 2015 Builder.restoreIP(OldIP); 2016 return VectorValue; 2017 } 2018 2019 // If this scalar is unknown, assume that it is a constant or that it is 2020 // loop invariant. Broadcast V and save the value for future uses. 2021 Value *B = getBroadcastInstrs(V); 2022 VectorLoopValueMap.setVectorValue(V, Part, B); 2023 return B; 2024 } 2025 2026 Value * 2027 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2028 const VPIteration &Instance) { 2029 // If the value is not an instruction contained in the loop, it should 2030 // already be scalar. 2031 if (OrigLoop->isLoopInvariant(V)) 2032 return V; 2033 2034 assert(Instance.Lane > 0 2035 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2036 : true && "Uniform values only have lane zero"); 2037 2038 // If the value from the original loop has not been vectorized, it is 2039 // represented by UF x VF scalar values in the new loop. Return the requested 2040 // scalar value. 2041 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2042 return VectorLoopValueMap.getScalarValue(V, Instance); 2043 2044 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2045 // for the given unroll part. If this entry is not a vector type (i.e., the 2046 // vectorization factor is one), there is no need to generate an 2047 // extractelement instruction. 2048 auto *U = getOrCreateVectorValue(V, Instance.Part); 2049 if (!U->getType()->isVectorTy()) { 2050 assert(VF == 1 && "Value not scalarized has non-vector type"); 2051 return U; 2052 } 2053 2054 // Otherwise, the value from the original loop has been vectorized and is 2055 // represented by UF vector values. Extract and return the requested scalar 2056 // value from the appropriate vector lane. 2057 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2058 } 2059 2060 void InnerLoopVectorizer::packScalarIntoVectorValue( 2061 Value *V, const VPIteration &Instance) { 2062 assert(V != Induction && "The new induction variable should not be used."); 2063 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2064 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2065 2066 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2067 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2068 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2069 Builder.getInt32(Instance.Lane)); 2070 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2071 } 2072 2073 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2074 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2075 SmallVector<Constant *, 8> ShuffleMask; 2076 for (unsigned i = 0; i < VF; ++i) 2077 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2078 2079 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2080 ConstantVector::get(ShuffleMask), 2081 "reverse"); 2082 } 2083 2084 // Return whether we allow using masked interleave-groups (for dealing with 2085 // strided loads/stores that reside in predicated blocks, or for dealing 2086 // with gaps). 
2087 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2088 // If an override option has been passed in for interleaved accesses, use it. 2089 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2090 return EnableMaskedInterleavedMemAccesses; 2091 2092 return TTI.enableMaskedInterleavedAccessVectorization(); 2093 } 2094 2095 // Try to vectorize the interleave group that \p Instr belongs to. 2096 // 2097 // E.g. Translate following interleaved load group (factor = 3): 2098 // for (i = 0; i < N; i+=3) { 2099 // R = Pic[i]; // Member of index 0 2100 // G = Pic[i+1]; // Member of index 1 2101 // B = Pic[i+2]; // Member of index 2 2102 // ... // do something to R, G, B 2103 // } 2104 // To: 2105 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2106 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2107 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2108 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2109 // 2110 // Or translate following interleaved store group (factor = 3): 2111 // for (i = 0; i < N; i+=3) { 2112 // ... do something to R, G, B 2113 // Pic[i] = R; // Member of index 0 2114 // Pic[i+1] = G; // Member of index 1 2115 // Pic[i+2] = B; // Member of index 2 2116 // } 2117 // To: 2118 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2119 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2120 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2121 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2122 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2123 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2124 VectorParts *BlockInMask) { 2125 const InterleaveGroup<Instruction> *Group = 2126 Cost->getInterleavedAccessGroup(Instr); 2127 assert(Group && "Fail to get an interleaved access group."); 2128 2129 // Skip if current instruction is not the insert position. 2130 if (Instr != Group->getInsertPos()) 2131 return; 2132 2133 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2134 Value *Ptr = getLoadStorePointerOperand(Instr); 2135 2136 // Prepare for the vector type of the interleaved load/store. 2137 Type *ScalarTy = getMemInstValueType(Instr); 2138 unsigned InterleaveFactor = Group->getFactor(); 2139 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2140 Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); 2141 2142 // Prepare for the new pointers. 2143 setDebugLocFromInst(Builder, Ptr); 2144 SmallVector<Value *, 2> NewPtrs; 2145 unsigned Index = Group->getIndex(Instr); 2146 2147 VectorParts Mask; 2148 bool IsMaskForCondRequired = BlockInMask; 2149 if (IsMaskForCondRequired) { 2150 Mask = *BlockInMask; 2151 // TODO: extend the masked interleaved-group support to reversed access. 2152 assert(!Group->isReverse() && "Reversed masked interleave-group " 2153 "not supported."); 2154 } 2155 2156 // If the group is reverse, adjust the index to refer to the last vector lane 2157 // instead of the first. We adjust the index from the first vector lane, 2158 // rather than directly getting the pointer for lane VF - 1, because the 2159 // pointer operand of the interleaved access is supposed to be uniform. For 2160 // uniform instructions, we're only required to generate a value for the 2161 // first vector lane in each unroll iteration. 
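  // For example, with VF = 4 and an interleave factor of 3, a member at index
  // 1 is adjusted to 1 + (4 - 1) * 3 = 10; the GEP below then subtracts this
  // index from the pointer so the wide access covers the full reversed range.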
2162 if (Group->isReverse()) 2163 Index += (VF - 1) * Group->getFactor(); 2164 2165 bool InBounds = false; 2166 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2167 InBounds = gep->isInBounds(); 2168 2169 for (unsigned Part = 0; Part < UF; Part++) { 2170 Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); 2171 2172 // Notice current instruction could be any index. Need to adjust the address 2173 // to the member of index 0. 2174 // 2175 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2176 // b = A[i]; // Member of index 0 2177 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2178 // 2179 // E.g. A[i+1] = a; // Member of index 1 2180 // A[i] = b; // Member of index 0 2181 // A[i+2] = c; // Member of index 2 (Current instruction) 2182 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2183 NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); 2184 if (InBounds) 2185 cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); 2186 2187 // Cast to the vector pointer type. 2188 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); 2189 } 2190 2191 setDebugLocFromInst(Builder, Instr); 2192 Value *UndefVec = UndefValue::get(VecTy); 2193 2194 Value *MaskForGaps = nullptr; 2195 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2196 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2197 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2198 } 2199 2200 // Vectorize the interleaved load group. 2201 if (isa<LoadInst>(Instr)) { 2202 // For each unroll part, create a wide load for the group. 2203 SmallVector<Value *, 2> NewLoads; 2204 for (unsigned Part = 0; Part < UF; Part++) { 2205 Instruction *NewLoad; 2206 if (IsMaskForCondRequired || MaskForGaps) { 2207 assert(useMaskedInterleavedAccesses(*TTI) && 2208 "masked interleaved groups are not allowed."); 2209 Value *GroupMask = MaskForGaps; 2210 if (IsMaskForCondRequired) { 2211 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2212 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2213 Value *ShuffledMask = Builder.CreateShuffleVector( 2214 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2215 GroupMask = MaskForGaps 2216 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2217 MaskForGaps) 2218 : ShuffledMask; 2219 } 2220 NewLoad = 2221 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), 2222 GroupMask, UndefVec, "wide.masked.vec"); 2223 } 2224 else 2225 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], 2226 Group->getAlignment(), "wide.vec"); 2227 Group->addMetadata(NewLoad); 2228 NewLoads.push_back(NewLoad); 2229 } 2230 2231 // For each member in the group, shuffle out the appropriate data from the 2232 // wide loads. 2233 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2234 Instruction *Member = Group->getMember(I); 2235 2236 // Skip the gaps in the group. 2237 if (!Member) 2238 continue; 2239 2240 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2241 for (unsigned Part = 0; Part < UF; Part++) { 2242 Value *StridedVec = Builder.CreateShuffleVector( 2243 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2244 2245 // If this member has different type, cast the result type. 
2246 if (Member->getType() != ScalarTy) { 2247 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2248 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2249 } 2250 2251 if (Group->isReverse()) 2252 StridedVec = reverseVector(StridedVec); 2253 2254 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2255 } 2256 } 2257 return; 2258 } 2259 2260 // The sub vector type for current instruction. 2261 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2262 2263 // Vectorize the interleaved store group. 2264 for (unsigned Part = 0; Part < UF; Part++) { 2265 // Collect the stored vector from each member. 2266 SmallVector<Value *, 4> StoredVecs; 2267 for (unsigned i = 0; i < InterleaveFactor; i++) { 2268 // Interleaved store group doesn't allow a gap, so each index has a member 2269 Instruction *Member = Group->getMember(i); 2270 assert(Member && "Fail to get a member from an interleaved store group"); 2271 2272 Value *StoredVec = getOrCreateVectorValue( 2273 cast<StoreInst>(Member)->getValueOperand(), Part); 2274 if (Group->isReverse()) 2275 StoredVec = reverseVector(StoredVec); 2276 2277 // If this member has different type, cast it to a unified type. 2278 2279 if (StoredVec->getType() != SubVT) 2280 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2281 2282 StoredVecs.push_back(StoredVec); 2283 } 2284 2285 // Concatenate all vectors into a wide vector. 2286 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2287 2288 // Interleave the elements in the wide vector. 2289 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2290 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2291 "interleaved.vec"); 2292 2293 Instruction *NewStoreInstr; 2294 if (IsMaskForCondRequired) { 2295 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2296 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2297 Value *ShuffledMask = Builder.CreateShuffleVector( 2298 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2299 NewStoreInstr = Builder.CreateMaskedStore( 2300 IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); 2301 } 2302 else 2303 NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 2304 Group->getAlignment()); 2305 2306 Group->addMetadata(NewStoreInstr); 2307 } 2308 } 2309 2310 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2311 VectorParts *BlockInMask) { 2312 // Attempt to issue a wide load. 2313 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2314 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2315 2316 assert((LI || SI) && "Invalid Load/Store instruction"); 2317 2318 LoopVectorizationCostModel::InstWidening Decision = 2319 Cost->getWideningDecision(Instr, VF); 2320 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2321 "CM decision should be taken at this point"); 2322 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2323 return vectorizeInterleaveGroup(Instr); 2324 2325 Type *ScalarDataTy = getMemInstValueType(Instr); 2326 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2327 Value *Ptr = getLoadStorePointerOperand(Instr); 2328 unsigned Alignment = getLoadStoreAlignment(Instr); 2329 // An alignment of 0 means target abi alignment. We need to use the scalar's 2330 // target abi alignment in such a case. 
2331 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2332 if (!Alignment) 2333 Alignment = DL.getABITypeAlignment(ScalarDataTy); 2334 unsigned AddressSpace = getLoadStoreAddressSpace(Instr); 2335 2336 // Determine if the pointer operand of the access is either consecutive or 2337 // reverse consecutive. 2338 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2339 bool ConsecutiveStride = 2340 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2341 bool CreateGatherScatter = 2342 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2343 2344 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2345 // gather/scatter. Otherwise Decision should have been to Scalarize. 2346 assert((ConsecutiveStride || CreateGatherScatter) && 2347 "The instruction should be scalarized"); 2348 2349 // Handle consecutive loads/stores. 2350 if (ConsecutiveStride) 2351 Ptr = getOrCreateScalarValue(Ptr, {0, 0}); 2352 2353 VectorParts Mask; 2354 bool isMaskRequired = BlockInMask; 2355 if (isMaskRequired) 2356 Mask = *BlockInMask; 2357 2358 bool InBounds = false; 2359 if (auto *gep = dyn_cast<GetElementPtrInst>( 2360 getLoadStorePointerOperand(Instr)->stripPointerCasts())) 2361 InBounds = gep->isInBounds(); 2362 2363 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2364 // Calculate the pointer for the specific unroll-part. 2365 GetElementPtrInst *PartPtr = nullptr; 2366 2367 if (Reverse) { 2368 // If the address is consecutive but reversed, then the 2369 // wide store needs to start at the last vector element. 2370 PartPtr = cast<GetElementPtrInst>( 2371 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2372 PartPtr->setIsInBounds(InBounds); 2373 PartPtr = cast<GetElementPtrInst>( 2374 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2375 PartPtr->setIsInBounds(InBounds); 2376 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2377 Mask[Part] = reverseVector(Mask[Part]); 2378 } else { 2379 PartPtr = cast<GetElementPtrInst>( 2380 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2381 PartPtr->setIsInBounds(InBounds); 2382 } 2383 2384 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2385 }; 2386 2387 // Handle Stores: 2388 if (SI) { 2389 setDebugLocFromInst(Builder, SI); 2390 2391 for (unsigned Part = 0; Part < UF; ++Part) { 2392 Instruction *NewSI = nullptr; 2393 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); 2394 if (CreateGatherScatter) { 2395 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2396 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2397 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2398 MaskPart); 2399 } else { 2400 if (Reverse) { 2401 // If we store to reverse consecutive memory locations, then we need 2402 // to reverse the order of elements in the stored value. 2403 StoredVal = reverseVector(StoredVal); 2404 // We don't want to update the value in the map as it might be used in 2405 // another expression. So don't call resetVectorValue(StoredVal). 2406 } 2407 auto *VecPtr = CreateVecPtr(Part, Ptr); 2408 if (isMaskRequired) 2409 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2410 Mask[Part]); 2411 else 2412 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2413 } 2414 addMetadata(NewSI, SI); 2415 } 2416 return; 2417 } 2418 2419 // Handle loads. 
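  // Illustrative sketch only (VF = 4, i32 elements, hypothetical value names)
  // of the three load forms created below:
  //   %wide.load = load <4 x i32>, <4 x i32>* %vecptr, align 4
  //   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
  //       <4 x i32>* %vecptr, i32 4, <4 x i1> %mask, <4 x i32> undef)
  //   %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(
  //       <4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> undef)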
2420 assert(LI && "Must have a load instruction"); 2421 setDebugLocFromInst(Builder, LI); 2422 for (unsigned Part = 0; Part < UF; ++Part) { 2423 Value *NewLI; 2424 if (CreateGatherScatter) { 2425 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2426 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2427 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2428 nullptr, "wide.masked.gather"); 2429 addMetadata(NewLI, LI); 2430 } else { 2431 auto *VecPtr = CreateVecPtr(Part, Ptr); 2432 if (isMaskRequired) 2433 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], 2434 UndefValue::get(DataTy), 2435 "wide.masked.load"); 2436 else 2437 NewLI = 2438 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2439 2440 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2441 addMetadata(NewLI, LI); 2442 if (Reverse) 2443 NewLI = reverseVector(NewLI); 2444 } 2445 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2446 } 2447 } 2448 2449 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2450 const VPIteration &Instance, 2451 bool IfPredicateInstr) { 2452 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2453 2454 setDebugLocFromInst(Builder, Instr); 2455 2456 // Does this instruction return a value ? 2457 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2458 2459 Instruction *Cloned = Instr->clone(); 2460 if (!IsVoidRetTy) 2461 Cloned->setName(Instr->getName() + ".cloned"); 2462 2463 // Replace the operands of the cloned instructions with their scalar 2464 // equivalents in the new loop. 2465 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2466 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2467 Cloned->setOperand(op, NewOp); 2468 } 2469 addNewMetadata(Cloned, Instr); 2470 2471 // Place the cloned scalar in the new loop. 2472 Builder.Insert(Cloned); 2473 2474 // Add the cloned scalar to the scalar map entry. 2475 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2476 2477 // If we just cloned a new assumption, add it the assumption cache. 2478 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2479 if (II->getIntrinsicID() == Intrinsic::assume) 2480 AC->registerAssumption(II); 2481 2482 // End if-block. 2483 if (IfPredicateInstr) 2484 PredicatedInstructions.push_back(Cloned); 2485 } 2486 2487 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2488 Value *End, Value *Step, 2489 Instruction *DL) { 2490 BasicBlock *Header = L->getHeader(); 2491 BasicBlock *Latch = L->getLoopLatch(); 2492 // As we're just creating this loop, it's possible no latch exists 2493 // yet. If so, use the header as this will be a single block loop. 2494 if (!Latch) 2495 Latch = Header; 2496 2497 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2498 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2499 setDebugLocFromInst(Builder, OldInst); 2500 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2501 2502 Builder.SetInsertPoint(Latch->getTerminator()); 2503 setDebugLocFromInst(Builder, OldInst); 2504 2505 // Create i+1 and fill the PHINode. 2506 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2507 Induction->addIncoming(Start, L->getLoopPreheader()); 2508 Induction->addIncoming(Next, Latch); 2509 // Create the compare. 2510 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2511 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2512 2513 // Now we have two terminators. 
Remove the old one from the block. 2514 Latch->getTerminator()->eraseFromParent(); 2515 2516 return Induction; 2517 } 2518 2519 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2520 if (TripCount) 2521 return TripCount; 2522 2523 assert(L && "Create Trip Count for null loop."); 2524 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2525 // Find the loop boundaries. 2526 ScalarEvolution *SE = PSE.getSE(); 2527 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2528 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2529 "Invalid loop count"); 2530 2531 Type *IdxTy = Legal->getWidestInductionType(); 2532 assert(IdxTy && "No type for induction"); 2533 2534 // The exit count might have the type of i64 while the phi is i32. This can 2535 // happen if we have an induction variable that is sign extended before the 2536 // compare. The only way that we get a backedge taken count is that the 2537 // induction variable was signed and as such will not overflow. In such a case 2538 // truncation is legal. 2539 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2540 IdxTy->getPrimitiveSizeInBits()) 2541 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2542 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2543 2544 // Get the total trip count from the count by adding 1. 2545 const SCEV *ExitCount = SE->getAddExpr( 2546 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2547 2548 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2549 2550 // Expand the trip count and place the new instructions in the preheader. 2551 // Notice that the pre-header does not change, only the loop body. 2552 SCEVExpander Exp(*SE, DL, "induction"); 2553 2554 // Count holds the overall loop count (N). 2555 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2556 L->getLoopPreheader()->getTerminator()); 2557 2558 if (TripCount->getType()->isPointerTy()) 2559 TripCount = 2560 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2561 L->getLoopPreheader()->getTerminator()); 2562 2563 return TripCount; 2564 } 2565 2566 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2567 if (VectorTripCount) 2568 return VectorTripCount; 2569 2570 Value *TC = getOrCreateTripCount(L); 2571 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2572 2573 Type *Ty = TC->getType(); 2574 Constant *Step = ConstantInt::get(Ty, VF * UF); 2575 2576 // If the tail is to be folded by masking, round the number of iterations N 2577 // up to a multiple of Step instead of rounding down. This is done by first 2578 // adding Step-1 and then rounding down. Note that it's ok if this addition 2579 // overflows: the vector induction variable will eventually wrap to zero given 2580 // that it starts at zero and its Step is a power of two; the loop will then 2581 // exit, with the last early-exit vector comparison also producing all-true. 2582 if (Cost->foldTailByMasking()) { 2583 assert(isPowerOf2_32(VF * UF) && 2584 "VF*UF must be a power of 2 when folding tail by masking"); 2585 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2586 } 2587 2588 // Now we need to generate the expression for the part of the loop that the 2589 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2590 // iterations are not required for correctness, or N - Step, otherwise. 
Step 2591 // is equal to the vectorization factor (number of SIMD elements) times the 2592 // unroll factor (number of SIMD instructions). 2593 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2594 2595 // If there is a non-reversed interleaved group that may speculatively access 2596 // memory out-of-bounds, we need to ensure that there will be at least one 2597 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2598 // the trip count, we set the remainder to be equal to the step. If the step 2599 // does not evenly divide the trip count, no adjustment is necessary since 2600 // there will already be scalar iterations. Note that the minimum iterations 2601 // check ensures that N >= Step. 2602 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2603 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2604 R = Builder.CreateSelect(IsZero, Step, R); 2605 } 2606 2607 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2608 2609 return VectorTripCount; 2610 } 2611 2612 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2613 const DataLayout &DL) { 2614 // Verify that V is a vector type with same number of elements as DstVTy. 2615 unsigned VF = DstVTy->getNumElements(); 2616 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2617 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2618 Type *SrcElemTy = SrcVecTy->getElementType(); 2619 Type *DstElemTy = DstVTy->getElementType(); 2620 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2621 "Vector elements must have same size"); 2622 2623 // Do a direct cast if element types are castable. 2624 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2625 return Builder.CreateBitOrPointerCast(V, DstVTy); 2626 } 2627 // V cannot be directly casted to desired vector type. 2628 // May happen when V is a floating point vector but DstVTy is a vector of 2629 // pointers or vice-versa. Handle this using a two-step bitcast using an 2630 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2631 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2632 "Only one type should be a pointer type"); 2633 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2634 "Only one type should be a floating point type"); 2635 Type *IntTy = 2636 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2637 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2638 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2639 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2640 } 2641 2642 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2643 BasicBlock *Bypass) { 2644 Value *Count = getOrCreateTripCount(L); 2645 BasicBlock *BB = L->getLoopPreheader(); 2646 IRBuilder<> Builder(BB->getTerminator()); 2647 2648 // Generate code to check if the loop's trip count is less than VF * UF, or 2649 // equal to it in case a scalar epilogue is required; this implies that the 2650 // vector trip count is zero. This check also covers the case where adding one 2651 // to the backedge-taken count overflowed leading to an incorrect trip count 2652 // of zero. In this case we will also jump to the scalar loop. 2653 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2654 : ICmpInst::ICMP_ULT; 2655 2656 // If tail is to be folded, vector loop takes care of all iterations. 
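  // For example (a sketch with VF = 4, UF = 2, no tail folding): if a scalar
  // epilogue is required the bypass condition is "Count <= 8" (ULE), because
  // Count == 8 would leave a vector trip count of zero after reserving scalar
  // iterations; otherwise it is "Count < 8" (ULT), since exactly 8 iterations
  // can all run in the vector loop.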
2657 Value *CheckMinIters = Builder.getFalse(); 2658 if (!Cost->foldTailByMasking()) 2659 CheckMinIters = Builder.CreateICmp( 2660 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2661 "min.iters.check"); 2662 2663 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2664 // Update dominator tree immediately if the generated block is a 2665 // LoopBypassBlock because SCEV expansions to generate loop bypass 2666 // checks may query it before the current function is finished. 2667 DT->addNewBlock(NewBB, BB); 2668 if (L->getParentLoop()) 2669 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2670 ReplaceInstWithInst(BB->getTerminator(), 2671 BranchInst::Create(Bypass, NewBB, CheckMinIters)); 2672 LoopBypassBlocks.push_back(BB); 2673 } 2674 2675 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2676 BasicBlock *BB = L->getLoopPreheader(); 2677 2678 // Generate the code to check that the SCEV assumptions that we made. 2679 // We want the new basic block to start at the first instruction in a 2680 // sequence of instructions that form a check. 2681 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2682 "scev.check"); 2683 Value *SCEVCheck = 2684 Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); 2685 2686 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2687 if (C->isZero()) 2688 return; 2689 2690 assert(!Cost->foldTailByMasking() && 2691 "Cannot SCEV check stride or overflow when folding tail"); 2692 // Create a new block containing the stride check. 2693 BB->setName("vector.scevcheck"); 2694 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2695 // Update dominator tree immediately if the generated block is a 2696 // LoopBypassBlock because SCEV expansions to generate loop bypass 2697 // checks may query it before the current function is finished. 2698 DT->addNewBlock(NewBB, BB); 2699 if (L->getParentLoop()) 2700 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2701 ReplaceInstWithInst(BB->getTerminator(), 2702 BranchInst::Create(Bypass, NewBB, SCEVCheck)); 2703 LoopBypassBlocks.push_back(BB); 2704 AddedSafetyChecks = true; 2705 } 2706 2707 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2708 // VPlan-native path does not do any analysis for runtime checks currently. 2709 if (EnableVPlanNativePath) 2710 return; 2711 2712 BasicBlock *BB = L->getLoopPreheader(); 2713 2714 // Generate the code that checks in runtime if arrays overlap. We put the 2715 // checks into a separate block to make the more common case of few elements 2716 // faster. 2717 Instruction *FirstCheckInst; 2718 Instruction *MemRuntimeCheck; 2719 std::tie(FirstCheckInst, MemRuntimeCheck) = 2720 Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); 2721 if (!MemRuntimeCheck) 2722 return; 2723 2724 assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail"); 2725 // Create a new block containing the memory check. 2726 BB->setName("vector.memcheck"); 2727 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2728 // Update dominator tree immediately if the generated block is a 2729 // LoopBypassBlock because SCEV expansions to generate loop bypass 2730 // checks may query it before the current function is finished. 
2731 DT->addNewBlock(NewBB, BB); 2732 if (L->getParentLoop()) 2733 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2734 ReplaceInstWithInst(BB->getTerminator(), 2735 BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); 2736 LoopBypassBlocks.push_back(BB); 2737 AddedSafetyChecks = true; 2738 2739 // We currently don't use LoopVersioning for the actual loop cloning but we 2740 // still use it to add the noalias metadata. 2741 LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2742 PSE.getSE()); 2743 LVer->prepareNoAliasMetadata(); 2744 } 2745 2746 Value *InnerLoopVectorizer::emitTransformedIndex( 2747 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2748 const InductionDescriptor &ID) const { 2749 2750 SCEVExpander Exp(*SE, DL, "induction"); 2751 auto Step = ID.getStep(); 2752 auto StartValue = ID.getStartValue(); 2753 assert(Index->getType() == Step->getType() && 2754 "Index type does not match StepValue type"); 2755 2756 // Note: the IR at this point is broken. We cannot use SE to create any new 2757 // SCEV and then expand it, hoping that SCEV's simplification will give us 2758 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2759 // lead to various SCEV crashes. So all we can do is to use builder and rely 2760 // on InstCombine for future simplifications. Here we handle some trivial 2761 // cases only. 2762 auto CreateAdd = [&B](Value *X, Value *Y) { 2763 assert(X->getType() == Y->getType() && "Types don't match!"); 2764 if (auto *CX = dyn_cast<ConstantInt>(X)) 2765 if (CX->isZero()) 2766 return Y; 2767 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2768 if (CY->isZero()) 2769 return X; 2770 return B.CreateAdd(X, Y); 2771 }; 2772 2773 auto CreateMul = [&B](Value *X, Value *Y) { 2774 assert(X->getType() == Y->getType() && "Types don't match!"); 2775 if (auto *CX = dyn_cast<ConstantInt>(X)) 2776 if (CX->isOne()) 2777 return Y; 2778 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2779 if (CY->isOne()) 2780 return X; 2781 return B.CreateMul(X, Y); 2782 }; 2783 2784 switch (ID.getKind()) { 2785 case InductionDescriptor::IK_IntInduction: { 2786 assert(Index->getType() == StartValue->getType() && 2787 "Index type does not match StartValue type"); 2788 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2789 return B.CreateSub(StartValue, Index); 2790 auto *Offset = CreateMul( 2791 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2792 return CreateAdd(StartValue, Offset); 2793 } 2794 case InductionDescriptor::IK_PtrInduction: { 2795 assert(isa<SCEVConstant>(Step) && 2796 "Expected constant step for pointer induction"); 2797 return B.CreateGEP( 2798 StartValue->getType()->getPointerElementType(), StartValue, 2799 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2800 &*B.GetInsertPoint()))); 2801 } 2802 case InductionDescriptor::IK_FpInduction: { 2803 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2804 auto InductionBinOp = ID.getInductionBinOp(); 2805 assert(InductionBinOp && 2806 (InductionBinOp->getOpcode() == Instruction::FAdd || 2807 InductionBinOp->getOpcode() == Instruction::FSub) && 2808 "Original bin op should be defined for FP induction"); 2809 2810 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2811 2812 // Floating point operations had to be 'fast' to enable the induction. 
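    // As a hedged illustration (hypothetical source loop): for
    //   float x = 1.0f; for (...) { use(x); x += 0.5f; }
    // the transformed index for logical iteration i is computed below as
    //   1.0 fadd (0.5 fmul i)
    // with fast-math flags set on the intermediate operations.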
2813 FastMathFlags Flags; 2814 Flags.setFast(); 2815 2816 Value *MulExp = B.CreateFMul(StepValue, Index); 2817 if (isa<Instruction>(MulExp)) 2818 // We have to check, the MulExp may be a constant. 2819 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2820 2821 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2822 "induction"); 2823 if (isa<Instruction>(BOp)) 2824 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2825 2826 return BOp; 2827 } 2828 case InductionDescriptor::IK_NoInduction: 2829 return nullptr; 2830 } 2831 llvm_unreachable("invalid enum"); 2832 } 2833 2834 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2835 /* 2836 In this function we generate a new loop. The new loop will contain 2837 the vectorized instructions while the old loop will continue to run the 2838 scalar remainder. 2839 2840 [ ] <-- loop iteration number check. 2841 / | 2842 / v 2843 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2844 | / | 2845 | / v 2846 || [ ] <-- vector pre header. 2847 |/ | 2848 | v 2849 | [ ] \ 2850 | [ ]_| <-- vector loop. 2851 | | 2852 | v 2853 | -[ ] <--- middle-block. 2854 | / | 2855 | / v 2856 -|- >[ ] <--- new preheader. 2857 | | 2858 | v 2859 | [ ] \ 2860 | [ ]_| <-- old scalar loop to handle remainder. 2861 \ | 2862 \ v 2863 >[ ] <-- exit block. 2864 ... 2865 */ 2866 2867 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 2868 BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); 2869 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 2870 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2871 assert(VectorPH && "Invalid loop structure"); 2872 assert(ExitBlock && "Must have an exit block"); 2873 2874 // Some loops have a single integer induction variable, while other loops 2875 // don't. One example is c++ iterators that often have multiple pointer 2876 // induction variables. In the code below we also support a case where we 2877 // don't have a single induction variable. 2878 // 2879 // We try to obtain an induction variable from the original loop as hard 2880 // as possible. However if we don't find one that: 2881 // - is an integer 2882 // - counts from zero, stepping by one 2883 // - is the size of the widest induction variable type 2884 // then we create a new one. 2885 OldInduction = Legal->getPrimaryInduction(); 2886 Type *IdxTy = Legal->getWidestInductionType(); 2887 2888 // Split the single block loop into the two loop structure described above. 2889 BasicBlock *VecBody = 2890 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 2891 BasicBlock *MiddleBlock = 2892 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 2893 BasicBlock *ScalarPH = 2894 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 2895 2896 // Create and register the new vector loop. 2897 Loop *Lp = LI->AllocateLoop(); 2898 Loop *ParentLoop = OrigLoop->getParentLoop(); 2899 2900 // Insert the new loop into the loop nest and register the new basic blocks 2901 // before calling any utilities such as SCEV that require valid LoopInfo. 2902 if (ParentLoop) { 2903 ParentLoop->addChildLoop(Lp); 2904 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); 2905 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); 2906 } else { 2907 LI->addTopLevelLoop(Lp); 2908 } 2909 Lp->addBasicBlockToLoop(VecBody, *LI); 2910 2911 // Find the loop boundaries. 2912 Value *Count = getOrCreateTripCount(Lp); 2913 2914 Value *StartIdx = ConstantInt::get(IdxTy, 0); 2915 2916 // Now, compare the new count to zero. 
If it is zero skip the vector loop and 2917 // jump to the scalar loop. This check also covers the case where the 2918 // backedge-taken count is uint##_max: adding one to it will overflow leading 2919 // to an incorrect trip count of zero. In this (rare) case we will also jump 2920 // to the scalar loop. 2921 emitMinimumIterationCountCheck(Lp, ScalarPH); 2922 2923 // Generate the code to check any assumptions that we've made for SCEV 2924 // expressions. 2925 emitSCEVChecks(Lp, ScalarPH); 2926 2927 // Generate the code that checks in runtime if arrays overlap. We put the 2928 // checks into a separate block to make the more common case of few elements 2929 // faster. 2930 emitMemRuntimeChecks(Lp, ScalarPH); 2931 2932 // Generate the induction variable. 2933 // The loop step is equal to the vectorization factor (num of SIMD elements) 2934 // times the unroll factor (num of SIMD instructions). 2935 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 2936 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 2937 Induction = 2938 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 2939 getDebugLocFromInstOrOperands(OldInduction)); 2940 2941 // We are going to resume the execution of the scalar loop. 2942 // Go over all of the induction variables that we found and fix the 2943 // PHIs that are left in the scalar version of the loop. 2944 // The starting values of PHI nodes depend on the counter of the last 2945 // iteration in the vectorized loop. 2946 // If we come from a bypass edge then we need to start from the original 2947 // start value. 2948 2949 // This variable saves the new starting index for the scalar loop. It is used 2950 // to test if there are any tail iterations left once the vector loop has 2951 // completed. 2952 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 2953 for (auto &InductionEntry : *List) { 2954 PHINode *OrigPhi = InductionEntry.first; 2955 InductionDescriptor II = InductionEntry.second; 2956 2957 // Create phi nodes to merge from the backedge-taken check block. 2958 PHINode *BCResumeVal = PHINode::Create( 2959 OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); 2960 // Copy original phi DL over to the new one. 2961 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 2962 Value *&EndValue = IVEndValues[OrigPhi]; 2963 if (OrigPhi == OldInduction) { 2964 // We know what the end value is. 2965 EndValue = CountRoundDown; 2966 } else { 2967 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 2968 Type *StepType = II.getStep()->getType(); 2969 Instruction::CastOps CastOp = 2970 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 2971 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 2972 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 2973 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 2974 EndValue->setName("ind.end"); 2975 } 2976 2977 // The new PHI merges the original incoming value, in case of a bypass, 2978 // or the value at the end of the vectorized loop. 2979 BCResumeVal->addIncoming(EndValue, MiddleBlock); 2980 2981 // Fix the scalar body counter (PHI node). 2982 // The old induction's phi node in the scalar body needs the truncated 2983 // value. 
2984 for (BasicBlock *BB : LoopBypassBlocks) 2985 BCResumeVal->addIncoming(II.getStartValue(), BB); 2986 OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); 2987 } 2988 2989 // We need the OrigLoop (scalar loop part) latch terminator to help 2990 // produce correct debug info for the middle block BB instructions. 2991 // The legality check stage guarantees that the loop will have a single 2992 // latch. 2993 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 2994 "Scalar loop latch terminator isn't a branch"); 2995 BranchInst *ScalarLatchBr = 2996 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 2997 2998 // Add a check in the middle block to see if we have completed 2999 // all of the iterations in the first vector loop. 3000 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3001 // If tail is to be folded, we know we don't need to run the remainder. 3002 Value *CmpN = Builder.getTrue(); 3003 if (!Cost->foldTailByMasking()) { 3004 CmpN = 3005 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3006 CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); 3007 3008 // Here we use the same DebugLoc as the scalar loop latch branch instead 3009 // of the corresponding compare because they may have ended up with 3010 // different line numbers and we want to avoid awkward line stepping while 3011 // debugging. Eg. if the compare has got a line number inside the loop. 3012 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3013 } 3014 3015 BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); 3016 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3017 ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); 3018 3019 // Get ready to start creating new instructions into the vectorized body. 3020 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); 3021 3022 // Save the state. 3023 LoopVectorPreHeader = Lp->getLoopPreheader(); 3024 LoopScalarPreHeader = ScalarPH; 3025 LoopMiddleBlock = MiddleBlock; 3026 LoopExitBlock = ExitBlock; 3027 LoopVectorBody = VecBody; 3028 LoopScalarBody = OldBasicBlock; 3029 3030 Optional<MDNode *> VectorizedLoopID = 3031 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3032 LLVMLoopVectorizeFollowupVectorized}); 3033 if (VectorizedLoopID.hasValue()) { 3034 Lp->setLoopID(VectorizedLoopID.getValue()); 3035 3036 // Do not setAlreadyVectorized if loop attributes have been defined 3037 // explicitly. 3038 return LoopVectorPreHeader; 3039 } 3040 3041 // Keep all loop hints from the original loop on the vector loop (we'll 3042 // replace the vectorizer-specific hints below). 3043 if (MDNode *LID = OrigLoop->getLoopID()) 3044 Lp->setLoopID(LID); 3045 3046 LoopVectorizeHints Hints(Lp, true, *ORE); 3047 Hints.setAlreadyVectorized(); 3048 3049 return LoopVectorPreHeader; 3050 } 3051 3052 // Fix up external users of the induction variable. At this point, we are 3053 // in LCSSA form, with all external PHIs that use the IV having one input value, 3054 // coming from the remainder loop. We need those PHIs to also have a correct 3055 // value for the IV when arriving directly from the middle block. 
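// As an illustrative sketch (hypothetical IR, not from a specific test): for a
// canonical induction
//   %i = phi i64 [ 0, %preheader ], [ %i.next, %latch ]
// an exit phi of %i.next uses the last value and receives EndValue (the count
// the remainder loop resumes from), while an exit phi of %i uses the
// penultimate value and receives EndValue - Step.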
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop
  // latch). We allow both, but they obviously have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
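    // (A hedged illustration: in the pattern above, an LCSSA exit phi of %IV1
    // is visited once through each induction's user list, so without this
    // check it would receive two incoming values for the middle block.)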
3112 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3113 PHI->addIncoming(I.second, MiddleBlock); 3114 } 3115 } 3116 3117 namespace { 3118 3119 struct CSEDenseMapInfo { 3120 static bool canHandle(const Instruction *I) { 3121 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3122 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3123 } 3124 3125 static inline Instruction *getEmptyKey() { 3126 return DenseMapInfo<Instruction *>::getEmptyKey(); 3127 } 3128 3129 static inline Instruction *getTombstoneKey() { 3130 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3131 } 3132 3133 static unsigned getHashValue(const Instruction *I) { 3134 assert(canHandle(I) && "Unknown instruction!"); 3135 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3136 I->value_op_end())); 3137 } 3138 3139 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3140 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3141 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3142 return LHS == RHS; 3143 return LHS->isIdenticalTo(RHS); 3144 } 3145 }; 3146 3147 } // end anonymous namespace 3148 3149 ///Perform cse of induction variable instructions. 3150 static void cse(BasicBlock *BB) { 3151 // Perform simple cse. 3152 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3153 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3154 Instruction *In = &*I++; 3155 3156 if (!CSEDenseMapInfo::canHandle(In)) 3157 continue; 3158 3159 // Check if we can replace this instruction with any of the 3160 // visited instructions. 3161 if (Instruction *V = CSEMap.lookup(In)) { 3162 In->replaceAllUsesWith(V); 3163 In->eraseFromParent(); 3164 continue; 3165 } 3166 3167 CSEMap[In] = In; 3168 } 3169 } 3170 3171 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3172 unsigned VF, 3173 bool &NeedToScalarize) { 3174 Function *F = CI->getCalledFunction(); 3175 StringRef FnName = CI->getCalledFunction()->getName(); 3176 Type *ScalarRetTy = CI->getType(); 3177 SmallVector<Type *, 4> Tys, ScalarTys; 3178 for (auto &ArgOp : CI->arg_operands()) 3179 ScalarTys.push_back(ArgOp->getType()); 3180 3181 // Estimate cost of scalarized vector call. The source operands are assumed 3182 // to be vectors, so we need to extract individual elements from there, 3183 // execute VF scalar calls, and then gather the result into the vector return 3184 // value. 3185 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3186 if (VF == 1) 3187 return ScalarCallCost; 3188 3189 // Compute corresponding vector type for return value and arguments. 3190 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3191 for (Type *ScalarTy : ScalarTys) 3192 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3193 3194 // Compute costs of unpacking argument values for the scalar calls and 3195 // packing the return values to a vector. 3196 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3197 3198 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3199 3200 // If we can't emit a vector call for this function, then the currently found 3201 // cost is the cost we need to return. 3202 NeedToScalarize = true; 3203 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) 3204 return Cost; 3205 3206 // If the corresponding vector cost is cheaper, return its cost. 
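  // For example, with hypothetical TTI numbers: at VF = 4, a scalar call cost
  // of 5 and a scalarization overhead of 6 give 5 * 4 + 6 = 26; if the target
  // reports a vectorized library call cost of 18, the vector call is cheaper
  // and NeedToScalarize is cleared below.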
3207 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3208 if (VectorCallCost < Cost) { 3209 NeedToScalarize = false; 3210 return VectorCallCost; 3211 } 3212 return Cost; 3213 } 3214 3215 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3216 unsigned VF) { 3217 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3218 assert(ID && "Expected intrinsic call!"); 3219 3220 FastMathFlags FMF; 3221 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3222 FMF = FPMO->getFastMathFlags(); 3223 3224 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3225 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); 3226 } 3227 3228 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3229 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3230 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3231 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3232 } 3233 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3234 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3235 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3236 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3237 } 3238 3239 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3240 // For every instruction `I` in MinBWs, truncate the operands, create a 3241 // truncated version of `I` and reextend its result. InstCombine runs 3242 // later and will remove any ext/trunc pairs. 3243 SmallPtrSet<Value *, 4> Erased; 3244 for (const auto &KV : Cost->getMinimalBitwidths()) { 3245 // If the value wasn't vectorized, we must maintain the original scalar 3246 // type. The absence of the value from VectorLoopValueMap indicates that it 3247 // wasn't vectorized. 3248 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3249 continue; 3250 for (unsigned Part = 0; Part < UF; ++Part) { 3251 Value *I = getOrCreateVectorValue(KV.first, Part); 3252 if (Erased.find(I) != Erased.end() || I->use_empty() || 3253 !isa<Instruction>(I)) 3254 continue; 3255 Type *OriginalTy = I->getType(); 3256 Type *ScalarTruncatedTy = 3257 IntegerType::get(OriginalTy->getContext(), KV.second); 3258 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3259 OriginalTy->getVectorNumElements()); 3260 if (TruncatedTy == OriginalTy) 3261 continue; 3262 3263 IRBuilder<> B(cast<Instruction>(I)); 3264 auto ShrinkOperand = [&](Value *V) -> Value * { 3265 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3266 if (ZI->getSrcTy() == TruncatedTy) 3267 return ZI->getOperand(0); 3268 return B.CreateZExtOrTrunc(V, TruncatedTy); 3269 }; 3270 3271 // The actual instruction modification depends on the instruction type, 3272 // unfortunately. 3273 Value *NewI = nullptr; 3274 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3275 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3276 ShrinkOperand(BO->getOperand(1))); 3277 3278 // Any wrapping introduced by shrinking this operation shouldn't be 3279 // considered undefined behavior. So, we can't unconditionally copy 3280 // arithmetic wrapping flags to NewI. 
3281 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3282 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3283 NewI = 3284 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3285 ShrinkOperand(CI->getOperand(1))); 3286 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3287 NewI = B.CreateSelect(SI->getCondition(), 3288 ShrinkOperand(SI->getTrueValue()), 3289 ShrinkOperand(SI->getFalseValue())); 3290 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3291 switch (CI->getOpcode()) { 3292 default: 3293 llvm_unreachable("Unhandled cast!"); 3294 case Instruction::Trunc: 3295 NewI = ShrinkOperand(CI->getOperand(0)); 3296 break; 3297 case Instruction::SExt: 3298 NewI = B.CreateSExtOrTrunc( 3299 CI->getOperand(0), 3300 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3301 break; 3302 case Instruction::ZExt: 3303 NewI = B.CreateZExtOrTrunc( 3304 CI->getOperand(0), 3305 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3306 break; 3307 } 3308 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3309 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3310 auto *O0 = B.CreateZExtOrTrunc( 3311 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3312 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3313 auto *O1 = B.CreateZExtOrTrunc( 3314 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3315 3316 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3317 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3318 // Don't do anything with the operands, just extend the result. 3319 continue; 3320 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3321 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3322 auto *O0 = B.CreateZExtOrTrunc( 3323 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3324 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3325 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3326 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3327 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3328 auto *O0 = B.CreateZExtOrTrunc( 3329 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3330 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3331 } else { 3332 // If we don't know what to do, be conservative and don't do anything. 3333 continue; 3334 } 3335 3336 // Lastly, extend the result. 3337 NewI->takeName(cast<Instruction>(I)); 3338 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3339 I->replaceAllUsesWith(Res); 3340 cast<Instruction>(I)->eraseFromParent(); 3341 Erased.insert(I); 3342 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3343 } 3344 } 3345 3346 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3347 for (const auto &KV : Cost->getMinimalBitwidths()) { 3348 // If the value wasn't vectorized, we must maintain the original scalar 3349 // type. The absence of the value from VectorLoopValueMap indicates that it 3350 // wasn't vectorized. 
3351 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3352 continue; 3353 for (unsigned Part = 0; Part < UF; ++Part) { 3354 Value *I = getOrCreateVectorValue(KV.first, Part); 3355 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3356 if (Inst && Inst->use_empty()) { 3357 Value *NewI = Inst->getOperand(0); 3358 Inst->eraseFromParent(); 3359 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3360 } 3361 } 3362 } 3363 } 3364 3365 void InnerLoopVectorizer::fixVectorizedLoop() { 3366 // Insert truncates and extends for any truncated instructions as hints to 3367 // InstCombine. 3368 if (VF > 1) 3369 truncateToMinimalBitwidths(); 3370 3371 // Fix widened non-induction PHIs by setting up the PHI operands. 3372 if (OrigPHIsToFix.size()) { 3373 assert(EnableVPlanNativePath && 3374 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3375 fixNonInductionPHIs(); 3376 } 3377 3378 // At this point every instruction in the original loop is widened to a 3379 // vector form. Now we need to fix the recurrences in the loop. These PHI 3380 // nodes are currently empty because we did not want to introduce cycles. 3381 // This is the second stage of vectorizing recurrences. 3382 fixCrossIterationPHIs(); 3383 3384 // Update the dominator tree. 3385 // 3386 // FIXME: After creating the structure of the new loop, the dominator tree is 3387 // no longer up-to-date, and it remains that way until we update it 3388 // here. An out-of-date dominator tree is problematic for SCEV, 3389 // because SCEVExpander uses it to guide code generation. The 3390 // vectorizer use SCEVExpanders in several places. Instead, we should 3391 // keep the dominator tree up-to-date as we go. 3392 updateAnalysis(); 3393 3394 // Fix-up external users of the induction variables. 3395 for (auto &Entry : *Legal->getInductionVars()) 3396 fixupIVUsers(Entry.first, Entry.second, 3397 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3398 IVEndValues[Entry.first], LoopMiddleBlock); 3399 3400 fixLCSSAPHIs(); 3401 for (Instruction *PI : PredicatedInstructions) 3402 sinkScalarOperands(&*PI); 3403 3404 // Remove redundant induction instructions. 3405 cse(LoopVectorBody); 3406 } 3407 3408 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3409 // In order to support recurrences we need to be able to vectorize Phi nodes. 3410 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3411 // stage #2: We now need to fix the recurrences by adding incoming edges to 3412 // the currently empty PHI nodes. At this point every instruction in the 3413 // original loop is widened to a vector form so we can use them to construct 3414 // the incoming edges. 3415 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3416 // Handle first-order recurrences and reductions that need to be fixed. 3417 if (Legal->isFirstOrderRecurrence(&Phi)) 3418 fixFirstOrderRecurrence(&Phi); 3419 else if (Legal->isReductionVariable(&Phi)) 3420 fixReduction(&Phi); 3421 } 3422 } 3423 3424 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3425 // This is the second phase of vectorizing first-order recurrences. An 3426 // overview of the transformation is described below. Suppose we have the 3427 // following loop. 3428 // 3429 // for (int i = 0; i < n; ++i) 3430 // b[i] = a[i] - a[i - 1]; 3431 // 3432 // There is a first-order recurrence on "a". 
For this loop, the shorthand 3433 // scalar IR looks like: 3434 // 3435 // scalar.ph: 3436 // s_init = a[-1] 3437 // br scalar.body 3438 // 3439 // scalar.body: 3440 // i = phi [0, scalar.ph], [i+1, scalar.body] 3441 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3442 // s2 = a[i] 3443 // b[i] = s2 - s1 3444 // br cond, scalar.body, ... 3445 // 3446 // In this example, s1 is a recurrence because it's value depends on the 3447 // previous iteration. In the first phase of vectorization, we created a 3448 // temporary value for s1. We now complete the vectorization and produce the 3449 // shorthand vector IR shown below (for VF = 4, UF = 1). 3450 // 3451 // vector.ph: 3452 // v_init = vector(..., ..., ..., a[-1]) 3453 // br vector.body 3454 // 3455 // vector.body 3456 // i = phi [0, vector.ph], [i+4, vector.body] 3457 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3458 // v2 = a[i, i+1, i+2, i+3]; 3459 // v3 = vector(v1(3), v2(0, 1, 2)) 3460 // b[i, i+1, i+2, i+3] = v2 - v3 3461 // br cond, vector.body, middle.block 3462 // 3463 // middle.block: 3464 // x = v2(3) 3465 // br scalar.ph 3466 // 3467 // scalar.ph: 3468 // s_init = phi [x, middle.block], [a[-1], otherwise] 3469 // br scalar.body 3470 // 3471 // After execution completes the vector loop, we extract the next value of 3472 // the recurrence (x) to use as the initial value in the scalar loop. 3473 3474 // Get the original loop preheader and single loop latch. 3475 auto *Preheader = OrigLoop->getLoopPreheader(); 3476 auto *Latch = OrigLoop->getLoopLatch(); 3477 3478 // Get the initial and previous values of the scalar recurrence. 3479 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3480 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3481 3482 // Create a vector from the initial value. 3483 auto *VectorInit = ScalarInit; 3484 if (VF > 1) { 3485 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3486 VectorInit = Builder.CreateInsertElement( 3487 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3488 Builder.getInt32(VF - 1), "vector.recur.init"); 3489 } 3490 3491 // We constructed a temporary phi node in the first phase of vectorization. 3492 // This phi node will eventually be deleted. 3493 Builder.SetInsertPoint( 3494 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3495 3496 // Create a phi node for the new recurrence. The current value will either be 3497 // the initial value inserted into a vector or loop-varying vector value. 3498 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3499 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3500 3501 // Get the vectorized previous value of the last part UF - 1. It appears last 3502 // among all unrolled iterations, due to the order of their construction. 3503 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3504 3505 // Set the insertion point after the previous value if it is an instruction. 3506 // Note that the previous value may have been constant-folded so it is not 3507 // guaranteed to be an instruction in the vector loop. Also, if the previous 3508 // value is a phi node, we should insert after all the phi nodes to avoid 3509 // breaking basic block verification. 
3510 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || 3511 isa<PHINode>(PreviousLastPart)) 3512 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3513 else 3514 Builder.SetInsertPoint( 3515 &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); 3516 3517 // We will construct a vector for the recurrence by combining the values for 3518 // the current and previous iterations. This is the required shuffle mask. 3519 SmallVector<Constant *, 8> ShuffleMask(VF); 3520 ShuffleMask[0] = Builder.getInt32(VF - 1); 3521 for (unsigned I = 1; I < VF; ++I) 3522 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3523 3524 // The vector from which to take the initial value for the current iteration 3525 // (actual or unrolled). Initially, this is the vector phi node. 3526 Value *Incoming = VecPhi; 3527 3528 // Shuffle the current and previous vector and update the vector parts. 3529 for (unsigned Part = 0; Part < UF; ++Part) { 3530 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3531 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3532 auto *Shuffle = 3533 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3534 ConstantVector::get(ShuffleMask)) 3535 : Incoming; 3536 PhiPart->replaceAllUsesWith(Shuffle); 3537 cast<Instruction>(PhiPart)->eraseFromParent(); 3538 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3539 Incoming = PreviousPart; 3540 } 3541 3542 // Fix the latch value of the new recurrence in the vector loop. 3543 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3544 3545 // Extract the last vector element in the middle block. This will be the 3546 // initial value for the recurrence when jumping to the scalar loop. 3547 auto *ExtractForScalar = Incoming; 3548 if (VF > 1) { 3549 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3550 ExtractForScalar = Builder.CreateExtractElement( 3551 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3552 } 3553 // Extract the second last element in the middle block if the 3554 // Phi is used outside the loop. We need to extract the phi itself 3555 // and not the last element (the phi update in the current iteration). This 3556 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3557 // when the scalar loop is not run at all. 3558 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3559 if (VF > 1) 3560 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3561 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3562 // When loop is unrolled without vectorizing, initialize 3563 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3564 // `Incoming`. This is analogous to the vectorized case above: extracting the 3565 // second last element when VF > 1. 3566 else if (UF > 1) 3567 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3568 3569 // Fix the initial value of the original recurrence in the scalar loop. 3570 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3571 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3572 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3573 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3574 Start->addIncoming(Incoming, BB); 3575 } 3576 3577 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3578 Phi->setName("scalar.recur"); 3579 3580 // Finally, fix users of the recurrence outside the loop. 
  // The users will need either the last value of the scalar recurrence or
  // the last value of the vector recurrence we extracted in the middle
  // block. Since the loop is in LCSSA form, we just need to find all the phi
  // nodes for the original scalar recurrence in the exit block, and then add
  // an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}

void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
      RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, or, xor,
  // one for multiplication, -1 for And.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ?
VectorStart : Identity; 3660 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3661 cast<PHINode>(VecRdxPhi) 3662 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3663 } 3664 3665 // Before each round, move the insertion point right between 3666 // the PHIs and the values we are going to write. 3667 // This allows us to write both PHINodes and the extractelement 3668 // instructions. 3669 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3670 3671 setDebugLocFromInst(Builder, LoopExitInst); 3672 3673 // If the vector reduction can be performed in a smaller type, we truncate 3674 // then extend the loop exit value to enable InstCombine to evaluate the 3675 // entire expression in the smaller type. 3676 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3677 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3678 Builder.SetInsertPoint( 3679 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3680 VectorParts RdxParts(UF); 3681 for (unsigned Part = 0; Part < UF; ++Part) { 3682 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3683 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3684 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3685 : Builder.CreateZExt(Trunc, VecTy); 3686 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3687 UI != RdxParts[Part]->user_end();) 3688 if (*UI != Trunc) { 3689 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3690 RdxParts[Part] = Extnd; 3691 } else { 3692 ++UI; 3693 } 3694 } 3695 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3696 for (unsigned Part = 0; Part < UF; ++Part) { 3697 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3698 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3699 } 3700 } 3701 3702 // Reduce all of the unrolled parts into a single vector. 3703 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3704 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3705 3706 // The middle block terminator has already been assigned a DebugLoc here (the 3707 // OrigLoop's single latch terminator). We want the whole middle block to 3708 // appear to execute on this line because: (a) it is all compiler generated, 3709 // (b) these instructions are always executed after evaluating the latch 3710 // conditional branch, and (c) other passes may add new predecessors which 3711 // terminate on this line. This is the easiest way to ensure we don't 3712 // accidentally cause an extra step back into the loop while debugging. 3713 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3714 for (unsigned Part = 1; Part < UF; ++Part) { 3715 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3716 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3717 // Floating point operations had to be 'fast' to enable the reduction. 
3718 ReducedPartRdx = addFastMathFlag( 3719 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3720 ReducedPartRdx, "bin.rdx"), 3721 RdxDesc.getFastMathFlags()); 3722 else 3723 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3724 RdxPart); 3725 } 3726 3727 if (VF > 1) { 3728 bool NoNaN = Legal->hasFunNoNaNAttr(); 3729 ReducedPartRdx = 3730 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3731 // If the reduction can be performed in a smaller type, we need to extend 3732 // the reduction to the wider type before we branch to the original loop. 3733 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3734 ReducedPartRdx = 3735 RdxDesc.isSigned() 3736 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3737 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3738 } 3739 3740 // Create a phi node that merges control-flow from the backedge-taken check 3741 // block and the middle block. 3742 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3743 LoopScalarPreHeader->getTerminator()); 3744 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3745 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3746 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3747 3748 // Now, we need to fix the users of the reduction variable 3749 // inside and outside of the scalar remainder loop. 3750 // We know that the loop is in LCSSA form. We need to update the 3751 // PHI nodes in the exit blocks. 3752 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3753 // All PHINodes need to have a single entry edge, or two if 3754 // we already fixed them. 3755 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3756 3757 // We found a reduction value exit-PHI. Update it with the 3758 // incoming bypass edge. 3759 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3760 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3761 } // end of the LCSSA phi scan. 3762 3763 // Fix the scalar loop reduction variable with the incoming reduction sum 3764 // from the vector body and from the backedge value. 3765 int IncomingEdgeBlockIdx = 3766 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3767 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3768 // Pick the other block. 3769 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3770 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3771 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3772 } 3773 3774 void InnerLoopVectorizer::fixLCSSAPHIs() { 3775 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3776 if (LCSSAPhi.getNumIncomingValues() == 1) { 3777 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3778 // Non-instruction incoming values will have only one value. 3779 unsigned LastLane = 0; 3780 if (isa<Instruction>(IncomingValue)) 3781 LastLane = Cost->isUniformAfterVectorization( 3782 cast<Instruction>(IncomingValue), VF) 3783 ? 0 3784 : VF - 1; 3785 // Can be a loop invariant incoming value or the last scalar value to be 3786 // extracted from the vectorized loop. 3787 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3788 Value *lastIncomingValue = 3789 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 3790 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 3791 } 3792 } 3793 } 3794 3795 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3796 // The basic block and loop containing the predicated instruction. 
3797 auto *PredBB = PredInst->getParent(); 3798 auto *VectorLoop = LI->getLoopFor(PredBB); 3799 3800 // Initialize a worklist with the operands of the predicated instruction. 3801 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3802 3803 // Holds instructions that we need to analyze again. An instruction may be 3804 // reanalyzed if we don't yet know if we can sink it or not. 3805 SmallVector<Instruction *, 8> InstsToReanalyze; 3806 3807 // Returns true if a given use occurs in the predicated block. Phi nodes use 3808 // their operands in their corresponding predecessor blocks. 3809 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3810 auto *I = cast<Instruction>(U.getUser()); 3811 BasicBlock *BB = I->getParent(); 3812 if (auto *Phi = dyn_cast<PHINode>(I)) 3813 BB = Phi->getIncomingBlock( 3814 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3815 return BB == PredBB; 3816 }; 3817 3818 // Iteratively sink the scalarized operands of the predicated instruction 3819 // into the block we created for it. When an instruction is sunk, it's 3820 // operands are then added to the worklist. The algorithm ends after one pass 3821 // through the worklist doesn't sink a single instruction. 3822 bool Changed; 3823 do { 3824 // Add the instructions that need to be reanalyzed to the worklist, and 3825 // reset the changed indicator. 3826 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end()); 3827 InstsToReanalyze.clear(); 3828 Changed = false; 3829 3830 while (!Worklist.empty()) { 3831 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val()); 3832 3833 // We can't sink an instruction if it is a phi node, is already in the 3834 // predicated block, is not in the loop, or may have side effects. 3835 if (!I || isa<PHINode>(I) || I->getParent() == PredBB || 3836 !VectorLoop->contains(I) || I->mayHaveSideEffects()) 3837 continue; 3838 3839 // It's legal to sink the instruction if all its uses occur in the 3840 // predicated block. Otherwise, there's nothing to do yet, and we may 3841 // need to reanalyze the instruction. 3842 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) { 3843 InstsToReanalyze.push_back(I); 3844 continue; 3845 } 3846 3847 // Move the instruction to the beginning of the predicated block, and add 3848 // it's operands to the worklist. 3849 I->moveBefore(&*PredBB->getFirstInsertionPt()); 3850 Worklist.insert(I->op_begin(), I->op_end()); 3851 3852 // The sinking may have enabled other instructions to be sunk, so we will 3853 // need to iterate. 3854 Changed = true; 3855 } 3856 } while (Changed); 3857 } 3858 3859 void InnerLoopVectorizer::fixNonInductionPHIs() { 3860 for (PHINode *OrigPhi : OrigPHIsToFix) { 3861 PHINode *NewPhi = 3862 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 3863 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 3864 3865 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 3866 predecessors(OrigPhi->getParent())); 3867 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 3868 predecessors(NewPhi->getParent())); 3869 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 3870 "Scalar and Vector BB should have the same number of predecessors"); 3871 3872 // The insertion point in Builder may be invalidated by the time we get 3873 // here. Force the Builder insertion point to something valid so that we do 3874 // not run into issues during insertion point restore in 3875 // getOrCreateVectorValue calls below. 
3876 Builder.SetInsertPoint(NewPhi); 3877 3878 // The predecessor order is preserved and we can rely on mapping between 3879 // scalar and vector block predecessors. 3880 for (unsigned i = 0; i < NumIncomingValues; ++i) { 3881 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 3882 3883 // When looking up the new scalar/vector values to fix up, use incoming 3884 // values from original phi. 3885 Value *ScIncV = 3886 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 3887 3888 // Scalar incoming value may need a broadcast 3889 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 3890 NewPhi->addIncoming(NewIncV, NewPredBB); 3891 } 3892 } 3893 } 3894 3895 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 3896 unsigned VF) { 3897 PHINode *P = cast<PHINode>(PN); 3898 if (EnableVPlanNativePath) { 3899 // Currently we enter here in the VPlan-native path for non-induction 3900 // PHIs where all control flow is uniform. We simply widen these PHIs. 3901 // Create a vector phi with no operands - the vector phi operands will be 3902 // set at the end of vector code generation. 3903 Type *VecTy = 3904 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 3905 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 3906 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 3907 OrigPHIsToFix.push_back(P); 3908 3909 return; 3910 } 3911 3912 assert(PN->getParent() == OrigLoop->getHeader() && 3913 "Non-header phis should have been handled elsewhere"); 3914 3915 // In order to support recurrences we need to be able to vectorize Phi nodes. 3916 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3917 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 3918 // this value when we vectorize all of the instructions that use the PHI. 3919 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 3920 for (unsigned Part = 0; Part < UF; ++Part) { 3921 // This is phase one of vectorizing PHIs. 3922 Type *VecTy = 3923 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 3924 Value *EntryPart = PHINode::Create( 3925 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 3926 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 3927 } 3928 return; 3929 } 3930 3931 setDebugLocFromInst(Builder, P); 3932 3933 // This PHINode must be an induction variable. 3934 // Make sure that we know about it. 3935 assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); 3936 3937 InductionDescriptor II = Legal->getInductionVars()->lookup(P); 3938 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 3939 3940 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 3941 // which can be found from the original scalar operations. 3942 switch (II.getKind()) { 3943 case InductionDescriptor::IK_NoInduction: 3944 llvm_unreachable("Unknown induction"); 3945 case InductionDescriptor::IK_IntInduction: 3946 case InductionDescriptor::IK_FpInduction: 3947 llvm_unreachable("Integer/fp induction is handled elsewhere."); 3948 case InductionDescriptor::IK_PtrInduction: { 3949 // Handle the pointer induction variable case. 3950 assert(P->getType()->isPointerTy() && "Unexpected type."); 3951 // This is the normalized GEP that starts counting at zero. 3952 Value *PtrInd = Induction; 3953 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 3954 // Determine the number of scalars we need to generate for each unroll 3955 // iteration. 
If the instruction is uniform, we only need to generate the 3956 // first lane. Otherwise, we generate all VF values. 3957 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 3958 // These are the scalar results. Notice that we don't generate vector GEPs 3959 // because scalar GEPs result in better code. 3960 for (unsigned Part = 0; Part < UF; ++Part) { 3961 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 3962 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 3963 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 3964 Value *SclrGep = 3965 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 3966 SclrGep->setName("next.gep"); 3967 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 3968 } 3969 } 3970 return; 3971 } 3972 } 3973 } 3974 3975 /// A helper function for checking whether an integer division-related 3976 /// instruction may divide by zero (in which case it must be predicated if 3977 /// executed conditionally in the scalar code). 3978 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 3979 /// Non-zero divisors that are non compile-time constants will not be 3980 /// converted into multiplication, so we will still end up scalarizing 3981 /// the division, but can do so w/o predication. 3982 static bool mayDivideByZero(Instruction &I) { 3983 assert((I.getOpcode() == Instruction::UDiv || 3984 I.getOpcode() == Instruction::SDiv || 3985 I.getOpcode() == Instruction::URem || 3986 I.getOpcode() == Instruction::SRem) && 3987 "Unexpected instruction"); 3988 Value *Divisor = I.getOperand(1); 3989 auto *CInt = dyn_cast<ConstantInt>(Divisor); 3990 return !CInt || CInt->isZero(); 3991 } 3992 3993 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 3994 switch (I.getOpcode()) { 3995 case Instruction::Br: 3996 case Instruction::PHI: 3997 llvm_unreachable("This instruction is handled by a different recipe."); 3998 case Instruction::GetElementPtr: { 3999 // Construct a vector GEP by widening the operands of the scalar GEP as 4000 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4001 // results in a vector of pointers when at least one operand of the GEP 4002 // is vector-typed. Thus, to keep the representation compact, we only use 4003 // vector-typed operands for loop-varying values. 4004 auto *GEP = cast<GetElementPtrInst>(&I); 4005 4006 if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { 4007 // If we are vectorizing, but the GEP has only loop-invariant operands, 4008 // the GEP we build (by only using vector-typed operands for 4009 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4010 // produce a vector of pointers, we need to either arbitrarily pick an 4011 // operand to broadcast, or broadcast a clone of the original GEP. 4012 // Here, we broadcast a clone of the original. 4013 // 4014 // TODO: If at some point we decide to scalarize instructions having 4015 // loop-invariant operands, this special case will no longer be 4016 // required. We would add the scalarization decision to 4017 // collectLoopScalars() and teach getVectorValue() to broadcast 4018 // the lane-zero scalar value. 
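      // A shorthand sketch of the broadcast below (hypothetical IR, VF = 4):
      //   %clone  = getelementptr inbounds i32, i32* %base, i64 %inv.idx
      //   %insert = insertelement <4 x i32*> undef, i32* %clone, i32 0
      //   %splat  = shufflevector <4 x i32*> %insert, <4 x i32*> undef,
      //                           <4 x i32> zeroinitializer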
4019 auto *Clone = Builder.Insert(GEP->clone()); 4020 for (unsigned Part = 0; Part < UF; ++Part) { 4021 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4022 VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); 4023 addMetadata(EntryPart, GEP); 4024 } 4025 } else { 4026 // If the GEP has at least one loop-varying operand, we are sure to 4027 // produce a vector of pointers. But if we are only unrolling, we want 4028 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4029 // produce with the code below will be scalar (if VF == 1) or vector 4030 // (otherwise). Note that for the unroll-only case, we still maintain 4031 // values in the vector mapping with initVector, as we do for other 4032 // instructions. 4033 for (unsigned Part = 0; Part < UF; ++Part) { 4034 // The pointer operand of the new GEP. If it's loop-invariant, we 4035 // won't broadcast it. 4036 auto *Ptr = 4037 OrigLoop->isLoopInvariant(GEP->getPointerOperand()) 4038 ? GEP->getPointerOperand() 4039 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4040 4041 // Collect all the indices for the new GEP. If any index is 4042 // loop-invariant, we won't broadcast it. 4043 SmallVector<Value *, 4> Indices; 4044 for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { 4045 if (OrigLoop->isLoopInvariant(U.get())) 4046 Indices.push_back(U.get()); 4047 else 4048 Indices.push_back(getOrCreateVectorValue(U.get(), Part)); 4049 } 4050 4051 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4052 // but it should be a vector, otherwise. 4053 auto *NewGEP = 4054 GEP->isInBounds() 4055 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4056 Indices) 4057 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4058 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4059 "NewGEP is not a pointer vector"); 4060 VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); 4061 addMetadata(NewGEP, GEP); 4062 } 4063 } 4064 4065 break; 4066 } 4067 case Instruction::UDiv: 4068 case Instruction::SDiv: 4069 case Instruction::SRem: 4070 case Instruction::URem: 4071 case Instruction::Add: 4072 case Instruction::FAdd: 4073 case Instruction::Sub: 4074 case Instruction::FSub: 4075 case Instruction::FNeg: 4076 case Instruction::Mul: 4077 case Instruction::FMul: 4078 case Instruction::FDiv: 4079 case Instruction::FRem: 4080 case Instruction::Shl: 4081 case Instruction::LShr: 4082 case Instruction::AShr: 4083 case Instruction::And: 4084 case Instruction::Or: 4085 case Instruction::Xor: { 4086 // Just widen unops and binops. 4087 setDebugLocFromInst(Builder, &I); 4088 4089 for (unsigned Part = 0; Part < UF; ++Part) { 4090 SmallVector<Value *, 2> Ops; 4091 for (Value *Op : I.operands()) 4092 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4093 4094 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4095 4096 if (auto *VecOp = dyn_cast<Instruction>(V)) 4097 VecOp->copyIRFlags(&I); 4098 4099 // Use this vector value for all users of the original instruction. 4100 VectorLoopValueMap.setVectorValue(&I, Part, V); 4101 addMetadata(V, &I); 4102 } 4103 4104 break; 4105 } 4106 case Instruction::Select: { 4107 // Widen selects. 4108 // If the selector is loop invariant we can create a select 4109 // instruction with a scalar condition. Otherwise, use vector-select. 
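    // For example (illustrative IR only): with VF = 4, an invariant condition
    // keeps a scalar selector,
    //   select i1 %cond, <4 x i32> %a, <4 x i32> %b
    // while a loop-varying condition is widened to a vector selector,
    //   select <4 x i1> %cond.vec, <4 x i32> %a, <4 x i32> %b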
4110 auto *SE = PSE.getSE(); 4111 bool InvariantCond = 4112 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4113 setDebugLocFromInst(Builder, &I); 4114 4115 // The condition can be loop invariant but still defined inside the 4116 // loop. This means that we can't just use the original 'cond' value. 4117 // We have to take the 'vectorized' value and pick the first lane. 4118 // Instcombine will make this a no-op. 4119 4120 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4121 4122 for (unsigned Part = 0; Part < UF; ++Part) { 4123 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4124 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4125 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4126 Value *Sel = 4127 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4128 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4129 addMetadata(Sel, &I); 4130 } 4131 4132 break; 4133 } 4134 4135 case Instruction::ICmp: 4136 case Instruction::FCmp: { 4137 // Widen compares. Generate vector compares. 4138 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4139 auto *Cmp = dyn_cast<CmpInst>(&I); 4140 setDebugLocFromInst(Builder, Cmp); 4141 for (unsigned Part = 0; Part < UF; ++Part) { 4142 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4143 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4144 Value *C = nullptr; 4145 if (FCmp) { 4146 // Propagate fast math flags. 4147 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4148 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4149 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4150 } else { 4151 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4152 } 4153 VectorLoopValueMap.setVectorValue(&I, Part, C); 4154 addMetadata(C, &I); 4155 } 4156 4157 break; 4158 } 4159 4160 case Instruction::ZExt: 4161 case Instruction::SExt: 4162 case Instruction::FPToUI: 4163 case Instruction::FPToSI: 4164 case Instruction::FPExt: 4165 case Instruction::PtrToInt: 4166 case Instruction::IntToPtr: 4167 case Instruction::SIToFP: 4168 case Instruction::UIToFP: 4169 case Instruction::Trunc: 4170 case Instruction::FPTrunc: 4171 case Instruction::BitCast: { 4172 auto *CI = dyn_cast<CastInst>(&I); 4173 setDebugLocFromInst(Builder, CI); 4174 4175 /// Vectorize casts. 4176 Type *DestTy = 4177 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4178 4179 for (unsigned Part = 0; Part < UF; ++Part) { 4180 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4181 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4182 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4183 addMetadata(Cast, &I); 4184 } 4185 break; 4186 } 4187 4188 case Instruction::Call: { 4189 // Ignore dbg intrinsics. 4190 if (isa<DbgInfoIntrinsic>(I)) 4191 break; 4192 setDebugLocFromInst(Builder, &I); 4193 4194 Module *M = I.getParent()->getParent()->getParent(); 4195 auto *CI = cast<CallInst>(&I); 4196 4197 StringRef FnName = CI->getCalledFunction()->getName(); 4198 Function *F = CI->getCalledFunction(); 4199 Type *RetTy = ToVectorTy(CI->getType(), VF); 4200 SmallVector<Type *, 4> Tys; 4201 for (Value *ArgOperand : CI->arg_operands()) 4202 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4203 4204 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4205 4206 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4207 // version of the instruction. 4208 // Is it beneficial to perform intrinsic call compared to lib call? 
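    // For example (the numbers are illustrative, not from any particular
    // target): if the vector intrinsic form of this call is estimated at cost
    // 4 and the widened library call at cost 10, UseVectorIntrinsic below
    // becomes true and the intrinsic form is emitted.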
4209 bool NeedToScalarize; 4210 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4211 bool UseVectorIntrinsic = 4212 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4213 assert((UseVectorIntrinsic || !NeedToScalarize) && 4214 "Instruction should be scalarized elsewhere."); 4215 4216 for (unsigned Part = 0; Part < UF; ++Part) { 4217 SmallVector<Value *, 4> Args; 4218 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4219 Value *Arg = CI->getArgOperand(i); 4220 // Some intrinsics have a scalar argument - don't replace it with a 4221 // vector. 4222 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4223 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4224 Args.push_back(Arg); 4225 } 4226 4227 Function *VectorF; 4228 if (UseVectorIntrinsic) { 4229 // Use vector version of the intrinsic. 4230 Type *TysForDecl[] = {CI->getType()}; 4231 if (VF > 1) 4232 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4233 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4234 } else { 4235 // Use vector version of the library call. 4236 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); 4237 assert(!VFnName.empty() && "Vector function name is empty."); 4238 VectorF = M->getFunction(VFnName); 4239 if (!VectorF) { 4240 // Generate a declaration 4241 FunctionType *FTy = FunctionType::get(RetTy, Tys, false); 4242 VectorF = 4243 Function::Create(FTy, Function::ExternalLinkage, VFnName, M); 4244 VectorF->copyAttributesFrom(F); 4245 } 4246 } 4247 assert(VectorF && "Can't create vector function."); 4248 4249 SmallVector<OperandBundleDef, 1> OpBundles; 4250 CI->getOperandBundlesAsDefs(OpBundles); 4251 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4252 4253 if (isa<FPMathOperator>(V)) 4254 V->copyFastMathFlags(CI); 4255 4256 VectorLoopValueMap.setVectorValue(&I, Part, V); 4257 addMetadata(V, &I); 4258 } 4259 4260 break; 4261 } 4262 4263 default: 4264 // This instruction is not vectorized by simple widening. 4265 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4266 llvm_unreachable("Unhandled instruction!"); 4267 } // end of switch. 4268 } 4269 4270 void InnerLoopVectorizer::updateAnalysis() { 4271 // Forget the original basic block. 4272 PSE.getSE()->forgetLoop(OrigLoop); 4273 4274 // DT is not kept up-to-date for outer loop vectorization 4275 if (EnableVPlanNativePath) 4276 return; 4277 4278 // Update the dominator tree information. 4279 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && 4280 "Entry does not dominate exit."); 4281 4282 DT->addNewBlock(LoopMiddleBlock, 4283 LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4284 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); 4285 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 4286 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); 4287 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 4288 } 4289 4290 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4291 // We should not collect Scalars more than once per VF. Right now, this 4292 // function is called from collectUniformsAndScalars(), which already does 4293 // this check. Collecting Scalars for VF=1 does not make any sense. 
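  // As an illustration of what "scalar after vectorization" means here: the
  // address computation feeding a consecutive, widened load is needed only
  // once per vector access, so its getelementptr typically remains scalar
  // even though the load itself is widened.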
4294 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4295 "This function should not be visited twice for the same VF"); 4296 4297 SmallSetVector<Instruction *, 8> Worklist; 4298 4299 // These sets are used to seed the analysis with pointers used by memory 4300 // accesses that will remain scalar. 4301 SmallSetVector<Instruction *, 8> ScalarPtrs; 4302 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4303 4304 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4305 // The pointer operands of loads and stores will be scalar as long as the 4306 // memory access is not a gather or scatter operation. The value operand of a 4307 // store will remain scalar if the store is scalarized. 4308 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4309 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4310 assert(WideningDecision != CM_Unknown && 4311 "Widening decision should be ready at this moment"); 4312 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4313 if (Ptr == Store->getValueOperand()) 4314 return WideningDecision == CM_Scalarize; 4315 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4316 "Ptr is neither a value or pointer operand"); 4317 return WideningDecision != CM_GatherScatter; 4318 }; 4319 4320 // A helper that returns true if the given value is a bitcast or 4321 // getelementptr instruction contained in the loop. 4322 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4323 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4324 isa<GetElementPtrInst>(V)) && 4325 !TheLoop->isLoopInvariant(V); 4326 }; 4327 4328 // A helper that evaluates a memory access's use of a pointer. If the use 4329 // will be a scalar use, and the pointer is only used by memory accesses, we 4330 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4331 // PossibleNonScalarPtrs. 4332 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4333 // We only care about bitcast and getelementptr instructions contained in 4334 // the loop. 4335 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4336 return; 4337 4338 // If the pointer has already been identified as scalar (e.g., if it was 4339 // also identified as uniform), there's nothing to do. 4340 auto *I = cast<Instruction>(Ptr); 4341 if (Worklist.count(I)) 4342 return; 4343 4344 // If the use of the pointer will be a scalar use, and all users of the 4345 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4346 // place the pointer in PossibleNonScalarPtrs. 4347 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4348 return isa<LoadInst>(U) || isa<StoreInst>(U); 4349 })) 4350 ScalarPtrs.insert(I); 4351 else 4352 PossibleNonScalarPtrs.insert(I); 4353 }; 4354 4355 // We seed the scalars analysis with three classes of instructions: (1) 4356 // instructions marked uniform-after-vectorization, (2) bitcast and 4357 // getelementptr instructions used by memory accesses requiring a scalar use, 4358 // and (3) pointer induction variables and their update instructions (we 4359 // currently only scalarize these). 4360 // 4361 // (1) Add to the worklist all instructions that have been identified as 4362 // uniform-after-vectorization. 4363 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4364 4365 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4366 // memory accesses requiring a scalar use. 
The pointer operands of loads and 4367 // stores will be scalar as long as the memory accesses is not a gather or 4368 // scatter operation. The value operand of a store will remain scalar if the 4369 // store is scalarized. 4370 for (auto *BB : TheLoop->blocks()) 4371 for (auto &I : *BB) { 4372 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4373 evaluatePtrUse(Load, Load->getPointerOperand()); 4374 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4375 evaluatePtrUse(Store, Store->getPointerOperand()); 4376 evaluatePtrUse(Store, Store->getValueOperand()); 4377 } 4378 } 4379 for (auto *I : ScalarPtrs) 4380 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4381 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4382 Worklist.insert(I); 4383 } 4384 4385 // (3) Add to the worklist all pointer induction variables and their update 4386 // instructions. 4387 // 4388 // TODO: Once we are able to vectorize pointer induction variables we should 4389 // no longer insert them into the worklist here. 4390 auto *Latch = TheLoop->getLoopLatch(); 4391 for (auto &Induction : *Legal->getInductionVars()) { 4392 auto *Ind = Induction.first; 4393 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4394 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4395 continue; 4396 Worklist.insert(Ind); 4397 Worklist.insert(IndUpdate); 4398 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4399 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4400 << "\n"); 4401 } 4402 4403 // Insert the forced scalars. 4404 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4405 // induction variable when the PHI user is scalarized. 4406 auto ForcedScalar = ForcedScalars.find(VF); 4407 if (ForcedScalar != ForcedScalars.end()) 4408 for (auto *I : ForcedScalar->second) 4409 Worklist.insert(I); 4410 4411 // Expand the worklist by looking through any bitcasts and getelementptr 4412 // instructions we've already identified as scalar. This is similar to the 4413 // expansion step in collectLoopUniforms(); however, here we're only 4414 // expanding to include additional bitcasts and getelementptr instructions. 4415 unsigned Idx = 0; 4416 while (Idx != Worklist.size()) { 4417 Instruction *Dst = Worklist[Idx++]; 4418 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4419 continue; 4420 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4421 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4422 auto *J = cast<Instruction>(U); 4423 return !TheLoop->contains(J) || Worklist.count(J) || 4424 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4425 isScalarUse(J, Src)); 4426 })) { 4427 Worklist.insert(Src); 4428 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4429 } 4430 } 4431 4432 // An induction variable will remain scalar if all users of the induction 4433 // variable and induction variable update remain scalar. 4434 for (auto &Induction : *Legal->getInductionVars()) { 4435 auto *Ind = Induction.first; 4436 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4437 4438 // We already considered pointer induction variables, so there's no reason 4439 // to look at their users again. 4440 // 4441 // TODO: Once we are able to vectorize pointer induction variables we 4442 // should no longer skip over them here. 
    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}

bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction, get that
    // result.
    if (VF > 1) {
      InstWidening WideningDecision = getWideningDecision(I, VF);
      assert(WideningDecision != CM_Unknown &&
             "Widening decision should be ready at this moment");
      return WideningDecision == CM_Scalarize;
    }
    return isa<LoadInst>(I) ?
        !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
      : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}

bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
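  // Illustrative example of the second case: a load group that uses members 0
  // and 2 of a 3-member interleaved layout has a gap, and would normally need
  // a scalar epilogue so the last vector iteration does not read past the end
  // of the data; when a scalar epilogue is not allowed, that gap has to be
  // handled by masking instead.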
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getMemInstValueType(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
                          : TTI.isLegalMaskedStore(Ty);
}

bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Initialize the entry for this VF. Even if we find no uniform values, the
  // presence of the (possibly empty) entry means we will not analyze this VF
  // again: Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
    Worklist.insert(Cmp);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };
  // Iterate over the instructions in the loop, and collect all
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
  // that a consecutive-like pointer operand will be scalarized, we collect it
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
  // getelementptr instruction can be used by both vectorized and scalarized
  // memory instructions. For example, if a loop loads and stores from the same
  // location, but the store is conditional, the store will be scalarized, and
  // the getelementptr won't remain uniform.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (!Ptr)
        continue;

      // True if all users of Ptr are memory accesses that have Ptr as their
      // pointer operand.
      auto UsersAreMemAccesses =
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
            return getLoadStorePointerOperand(U) == Ptr;
          });

      // Ensure the memory instruction will not be scalarized or used by
      // gather/scatter, making its pointer operand non-uniform. If the pointer
      // operand is used by any instruction other than a memory access, we
      // conservatively assume the pointer operand may be non-uniform.
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, or interleaving - the pointer operand should
      // remain uniform.
      else
        ConsecutiveLikePtrs.insert(Ptr);
    }

  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
      Worklist.insert(V);
    }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures that
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First-order recurrence PHIs should typically be considered
      // non-uniform.
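      // For example (illustrative), in a loop computing b[i] = a[i] + a[i-1],
      // the recurrence PHI carrying the a[i-1] value yields a different value
      // in every vector lane, so it must not be added to the uniform worklist.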
4684 auto *OP = dyn_cast<PHINode>(OV); 4685 if (OP && Legal->isFirstOrderRecurrence(OP)) 4686 continue; 4687 // If all the users of the operand are uniform, then add the 4688 // operand into the uniform worklist. 4689 auto *OI = cast<Instruction>(OV); 4690 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4691 auto *J = cast<Instruction>(U); 4692 return Worklist.count(J) || 4693 (OI == getLoadStorePointerOperand(J) && 4694 isUniformDecision(J, VF)); 4695 })) { 4696 Worklist.insert(OI); 4697 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); 4698 } 4699 } 4700 } 4701 4702 // Returns true if Ptr is the pointer operand of a memory access instruction 4703 // I, and I is known to not require scalarization. 4704 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4705 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4706 }; 4707 4708 // For an instruction to be added into Worklist above, all its users inside 4709 // the loop should also be in Worklist. However, this condition cannot be 4710 // true for phi nodes that form a cyclic dependence. We must process phi 4711 // nodes separately. An induction variable will remain uniform if all users 4712 // of the induction variable and induction variable update remain uniform. 4713 // The code below handles both pointer and non-pointer induction variables. 4714 for (auto &Induction : *Legal->getInductionVars()) { 4715 auto *Ind = Induction.first; 4716 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4717 4718 // Determine if all users of the induction variable are uniform after 4719 // vectorization. 4720 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4721 auto *I = cast<Instruction>(U); 4722 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4723 isVectorizedMemAccessUse(I, Ind); 4724 }); 4725 if (!UniformInd) 4726 continue; 4727 4728 // Determine if all users of the induction variable update instruction are 4729 // uniform after vectorization. 4730 auto UniformIndUpdate = 4731 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4732 auto *I = cast<Instruction>(U); 4733 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4734 isVectorizedMemAccessUse(I, IndUpdate); 4735 }); 4736 if (!UniformIndUpdate) 4737 continue; 4738 4739 // The induction variable and its update instruction will remain uniform. 4740 Worklist.insert(Ind); 4741 Worklist.insert(IndUpdate); 4742 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); 4743 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate 4744 << "\n"); 4745 } 4746 4747 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4748 } 4749 4750 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4751 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4752 4753 if (Legal->getRuntimePointerChecking()->Need) { 4754 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4755 "runtime pointer checks needed. Enable vectorization of this " 4756 "loop with '#pragma clang loop vectorize(enable)' when " 4757 "compiling with -Os/-Oz", 4758 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4759 return true; 4760 } 4761 4762 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4763 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4764 "runtime SCEV checks needed. 
Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededPredicatePragma:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI))
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();

  unsigned MaxVF = computeFeasibleMaxVF(TC);
  if (TC > 0 && TC % MaxVF == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
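  // Illustrative example: with TC = 10 and MaxVF = 4, a plain vector loop
  // would leave a 2-iteration scalar tail; folding the tail by masking runs
  // ceil(10 / 4) = 3 masked vector iterations instead and needs no scalar
  // remainder loop.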
4848 if (Legal->canFoldTailByMasking()) { 4849 FoldTailByMasking = true; 4850 return MaxVF; 4851 } 4852 4853 if (TC == 0) { 4854 reportVectorizationFailure( 4855 "Unable to calculate the loop count due to complex control flow", 4856 "unable to calculate the loop count due to complex control flow", 4857 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4858 return None; 4859 } 4860 4861 reportVectorizationFailure( 4862 "Cannot optimize for size and vectorize at the same time.", 4863 "cannot optimize for size and vectorize at the same time. " 4864 "Enable vectorization of this loop with '#pragma clang loop " 4865 "vectorize(enable)' when compiling with -Os/-Oz", 4866 "NoTailLoopWithOptForSize", ORE, TheLoop); 4867 return None; 4868 } 4869 4870 unsigned 4871 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 4872 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4873 unsigned SmallestType, WidestType; 4874 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4875 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 4876 4877 // Get the maximum safe dependence distance in bits computed by LAA. 4878 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4879 // the memory accesses that is most restrictive (involved in the smallest 4880 // dependence distance). 4881 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 4882 4883 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 4884 4885 unsigned MaxVectorSize = WidestRegister / WidestType; 4886 4887 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4888 << " / " << WidestType << " bits.\n"); 4889 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4890 << WidestRegister << " bits.\n"); 4891 4892 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 4893 " into one vector!"); 4894 if (MaxVectorSize == 0) { 4895 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 4896 MaxVectorSize = 1; 4897 return MaxVectorSize; 4898 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 4899 isPowerOf2_32(ConstTripCount)) { 4900 // We need to clamp the VF to be the ConstTripCount. There is no point in 4901 // choosing a higher viable VF as done in the loop below. 4902 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 4903 << ConstTripCount << "\n"); 4904 MaxVectorSize = ConstTripCount; 4905 return MaxVectorSize; 4906 } 4907 4908 unsigned MaxVF = MaxVectorSize; 4909 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 4910 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 4911 // Collect all viable vectorization factors larger than the default MaxVF 4912 // (i.e. MaxVectorSize). 4913 SmallVector<unsigned, 8> VFs; 4914 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 4915 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 4916 VFs.push_back(VS); 4917 4918 // For each VF calculate its register usage. 4919 auto RUs = calculateRegisterUsage(VFs); 4920 4921 // Select the largest VF which doesn't require more registers than existing 4922 // ones. 
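    // Illustrative example (register width and type sizes are assumed, not
    // taken from a real target): with 256-bit registers, a widest type of
    // 64 bits and a smallest type of 8 bits, the default MaxVectorSize is
    // 256 / 64 = 4 and candidate VFs of 8, 16 and 32 were collected above;
    // the loop below keeps the largest of them whose estimated register
    // usage still fits the target's register file.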
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
    for (int i = RUs.size() - 1; i >= 0; --i) {
      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
        MaxVF = VFs[i];
        break;
      }
    }
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}

VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
  float Cost = expectedCost(1).first;
  const float ScalarCost = Cost;
  unsigned Width = 1;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(i);
    float VectorCost = C.first / (float)i;
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (int)VectorCost << ".\n");
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    Width = 1;
    Cost = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
  return Factor;
}

std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
5014 if (auto *PN = dyn_cast<PHINode>(&I)) { 5015 if (!Legal->isReductionVariable(PN)) 5016 continue; 5017 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; 5018 T = RdxDesc.getRecurrenceType(); 5019 } 5020 5021 // Examine the stored values. 5022 if (auto *ST = dyn_cast<StoreInst>(&I)) 5023 T = ST->getValueOperand()->getType(); 5024 5025 // Ignore loaded pointer types and stored pointer types that are not 5026 // vectorizable. 5027 // 5028 // FIXME: The check here attempts to predict whether a load or store will 5029 // be vectorized. We only know this for certain after a VF has 5030 // been selected. Here, we assume that if an access can be 5031 // vectorized, it will be. We should also look at extending this 5032 // optimization to non-pointer types. 5033 // 5034 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5035 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5036 continue; 5037 5038 MinWidth = std::min(MinWidth, 5039 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5040 MaxWidth = std::max(MaxWidth, 5041 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5042 } 5043 } 5044 5045 return {MinWidth, MaxWidth}; 5046 } 5047 5048 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5049 unsigned LoopCost) { 5050 // -- The interleave heuristics -- 5051 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5052 // There are many micro-architectural considerations that we can't predict 5053 // at this level. For example, frontend pressure (on decode or fetch) due to 5054 // code size, or the number and capabilities of the execution ports. 5055 // 5056 // We use the following heuristics to select the interleave count: 5057 // 1. If the code has reductions, then we interleave to break the cross 5058 // iteration dependency. 5059 // 2. If the loop is really small, then we interleave to reduce the loop 5060 // overhead. 5061 // 3. We don't interleave if we think that we will spill registers to memory 5062 // due to the increased register pressure. 5063 5064 if (!isScalarEpilogueAllowed()) 5065 return 1; 5066 5067 // We used the distance for the interleave count. 5068 if (Legal->getMaxSafeDepDistBytes() != -1U) 5069 return 1; 5070 5071 // Do not interleave loops with a relatively small trip count. 5072 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 5073 if (TC > 1 && TC < TinyTripCountInterleaveThreshold) 5074 return 1; 5075 5076 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); 5077 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5078 << " registers\n"); 5079 5080 if (VF == 1) { 5081 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5082 TargetNumRegisters = ForceTargetNumScalarRegs; 5083 } else { 5084 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5085 TargetNumRegisters = ForceTargetNumVectorRegs; 5086 } 5087 5088 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5089 // We divide by these constants so assume that we have at least one 5090 // instruction that uses at least one register. 5091 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); 5092 5093 // We calculate the interleave count using the following formula. 5094 // Subtract the number of loop invariants from the number of available 5095 // registers. These registers are used by all of the interleaved instances. 
5096 // Next, divide the remaining registers by the number of registers that is 5097 // required by the loop, in order to estimate how many parallel instances 5098 // fit without causing spills. All of this is rounded down if necessary to be 5099 // a power of two. We want power of two interleave count to simplify any 5100 // addressing operations or alignment considerations. 5101 // We also want power of two interleave counts to ensure that the induction 5102 // variable of the vector loop wraps to zero, when tail is folded by masking; 5103 // this currently happens when OptForSize, in which case IC is set to 1 above. 5104 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / 5105 R.MaxLocalUsers); 5106 5107 // Don't count the induction variable as interleaved. 5108 if (EnableIndVarRegisterHeur) 5109 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / 5110 std::max(1U, (R.MaxLocalUsers - 1))); 5111 5112 // Clamp the interleave ranges to reasonable counts. 5113 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5114 5115 // Check if the user has overridden the max. 5116 if (VF == 1) { 5117 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5118 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5119 } else { 5120 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5121 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5122 } 5123 5124 // If the trip count is constant, limit the interleave count to be less than 5125 // the trip count divided by VF. 5126 if (TC > 0) { 5127 assert(TC >= VF && "VF exceeds trip count?"); 5128 if ((TC / VF) < MaxInterleaveCount) 5129 MaxInterleaveCount = (TC / VF); 5130 } 5131 5132 // If we did not calculate the cost for VF (because the user selected the VF) 5133 // then we calculate the cost of VF here. 5134 if (LoopCost == 0) 5135 LoopCost = expectedCost(VF).first; 5136 5137 assert(LoopCost && "Non-zero loop cost expected"); 5138 5139 // Clamp the calculated IC to be between the 1 and the max interleave count 5140 // that the target and trip count allows. 5141 if (IC > MaxInterleaveCount) 5142 IC = MaxInterleaveCount; 5143 else if (IC < 1) 5144 IC = 1; 5145 5146 // Interleave if we vectorized this loop and there is a reduction that could 5147 // benefit from interleaving. 5148 if (VF > 1 && !Legal->getReductionVars()->empty()) { 5149 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5150 return IC; 5151 } 5152 5153 // Note that if we've already vectorized the loop we will have done the 5154 // runtime check and so interleaving won't require further checks. 5155 bool InterleavingRequiresRuntimePointerCheck = 5156 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5157 5158 // We want to interleave small loops in order to reduce the loop overhead and 5159 // potentially expose ILP opportunities. 5160 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5161 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5162 // We assume that the cost overhead is 1 and we use the cost model 5163 // to estimate the cost of the loop and interleave until the cost of the 5164 // loop overhead is about 5% of the cost of the loop. 5165 unsigned SmallIC = 5166 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5167 5168 // Interleave until store/load ports (estimated by max interleave count) are 5169 // saturated. 
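    // Worked example (numbers are illustrative): with IC = 8, two stores and
    // one load in the loop, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8; the
    // larger of the two is used below if it exceeds SmallIC.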
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2, so
    // the critical path only gets increased by one reduction operation.
    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = !Legal->getReductionVars()->empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
5241 IntervalMap EndPoint; 5242 // Saves the list of instruction indices that are used in the loop. 5243 SmallPtrSet<Instruction *, 8> Ends; 5244 // Saves the list of values that are used in the loop but are 5245 // defined outside the loop, such as arguments and constants. 5246 SmallPtrSet<Value *, 8> LoopInvariants; 5247 5248 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5249 for (Instruction &I : BB->instructionsWithoutDebug()) { 5250 IdxToInstr.push_back(&I); 5251 5252 // Save the end location of each USE. 5253 for (Value *U : I.operands()) { 5254 auto *Instr = dyn_cast<Instruction>(U); 5255 5256 // Ignore non-instruction values such as arguments, constants, etc. 5257 if (!Instr) 5258 continue; 5259 5260 // If this instruction is outside the loop then record it and continue. 5261 if (!TheLoop->contains(Instr)) { 5262 LoopInvariants.insert(Instr); 5263 continue; 5264 } 5265 5266 // Overwrite previous end points. 5267 EndPoint[Instr] = IdxToInstr.size(); 5268 Ends.insert(Instr); 5269 } 5270 } 5271 } 5272 5273 // Saves the list of intervals that end with the index in 'key'. 5274 using InstrList = SmallVector<Instruction *, 2>; 5275 DenseMap<unsigned, InstrList> TransposeEnds; 5276 5277 // Transpose the EndPoints to a list of values that end at each index. 5278 for (auto &Interval : EndPoint) 5279 TransposeEnds[Interval.second].push_back(Interval.first); 5280 5281 SmallPtrSet<Instruction *, 8> OpenIntervals; 5282 5283 // Get the size of the widest register. 5284 unsigned MaxSafeDepDist = -1U; 5285 if (Legal->getMaxSafeDepDistBytes() != -1U) 5286 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5287 unsigned WidestRegister = 5288 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5289 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5290 5291 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5292 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); 5293 5294 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5295 5296 // A lambda that gets the register usage for the given type and VF. 5297 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5298 if (Ty->isTokenTy()) 5299 return 0U; 5300 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5301 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5302 }; 5303 5304 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5305 Instruction *I = IdxToInstr[i]; 5306 5307 // Remove all of the instructions that end at this location. 5308 InstrList &List = TransposeEnds[i]; 5309 for (Instruction *ToRemove : List) 5310 OpenIntervals.erase(ToRemove); 5311 5312 // Ignore instructions that are never used within the loop. 5313 if (Ends.find(I) == Ends.end()) 5314 continue; 5315 5316 // Skip ignored values. 5317 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5318 continue; 5319 5320 // For each VF find the maximum usage of registers. 5321 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5322 if (VFs[j] == 1) { 5323 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); 5324 continue; 5325 } 5326 collectUniformsAndScalars(VFs[j]); 5327 // Count the number of live intervals. 5328 unsigned RegUsage = 0; 5329 for (auto Inst : OpenIntervals) { 5330 // Skip ignored values for VF > 1. 
        if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
            isScalarAfterVectorization(Inst, VFs[j]))
          continue;
        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
      }
      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    unsigned Invariant = 0;
    if (VFs[i] == 1)
      Invariant = LoopInvariants.size();
    else {
      for (auto Inst : LoopInvariants)
        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
    }

    LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
    LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
    LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
                      << '\n');

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
5411 PredicatedBBsAfterVectorization.insert(BB); 5412 } 5413 } 5414 } 5415 5416 int LoopVectorizationCostModel::computePredInstDiscount( 5417 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts, 5418 unsigned VF) { 5419 assert(!isUniformAfterVectorization(PredInst, VF) && 5420 "Instruction marked uniform-after-vectorization will be predicated"); 5421 5422 // Initialize the discount to zero, meaning that the scalar version and the 5423 // vector version cost the same. 5424 int Discount = 0; 5425 5426 // Holds instructions to analyze. The instructions we visit are mapped in 5427 // ScalarCosts. Those instructions are the ones that would be scalarized if 5428 // we find that the scalar version costs less. 5429 SmallVector<Instruction *, 8> Worklist; 5430 5431 // Returns true if the given instruction can be scalarized. 5432 auto canBeScalarized = [&](Instruction *I) -> bool { 5433 // We only attempt to scalarize instructions forming a single-use chain 5434 // from the original predicated block that would otherwise be vectorized. 5435 // Although not strictly necessary, we give up on instructions we know will 5436 // already be scalar to avoid traversing chains that are unlikely to be 5437 // beneficial. 5438 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || 5439 isScalarAfterVectorization(I, VF)) 5440 return false; 5441 5442 // If the instruction is scalar with predication, it will be analyzed 5443 // separately. We ignore it within the context of PredInst. 5444 if (isScalarWithPredication(I)) 5445 return false; 5446 5447 // If any of the instruction's operands are uniform after vectorization, 5448 // the instruction cannot be scalarized. This prevents, for example, a 5449 // masked load from being scalarized. 5450 // 5451 // We assume we will only emit a value for lane zero of an instruction 5452 // marked uniform after vectorization, rather than VF identical values. 5453 // Thus, if we scalarize an instruction that uses a uniform, we would 5454 // create uses of values corresponding to the lanes we aren't emitting code 5455 // for. This behavior can be changed by allowing getScalarValue to clone 5456 // the lane zero values for uniforms rather than asserting. 5457 for (Use &U : I->operands()) 5458 if (auto *J = dyn_cast<Instruction>(U.get())) 5459 if (isUniformAfterVectorization(J, VF)) 5460 return false; 5461 5462 // Otherwise, we can scalarize the instruction. 5463 return true; 5464 }; 5465 5466 // Compute the expected cost discount from scalarizing the entire expression 5467 // feeding the predicated instruction. We currently only consider expressions 5468 // that are single-use instruction chains. 5469 Worklist.push_back(PredInst); 5470 while (!Worklist.empty()) { 5471 Instruction *I = Worklist.pop_back_val(); 5472 5473 // If we've already analyzed the instruction, there's nothing to do. 5474 if (ScalarCosts.find(I) != ScalarCosts.end()) 5475 continue; 5476 5477 // Compute the cost of the vector instruction. Note that this cost already 5478 // includes the scalarization overhead of the predicated instruction. 5479 unsigned VectorCost = getInstructionCost(I, VF).first; 5480 5481 // Compute the cost of the scalarized instruction. This cost is the cost of 5482 // the instruction as if it wasn't if-converted and instead remained in the 5483 // predicated block. We will scale this cost by block probability after 5484 // computing the scalarization overhead. 
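// For illustration, assuming a reciprocal block probability of 2 (i.e. the
// predicated block is expected to execute on every other iteration): with
// VF = 4 and a per-lane scalar cost of 1, and ignoring the insert/extract
// overhead added below, the estimate starts at 4 * 1 = 4 and is later
// divided by 2, giving a scaled scalar cost of 2.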
5485 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5486 5487 // Compute the scalarization overhead of needed insertelement instructions 5488 // and phi nodes. 5489 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5490 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5491 true, false); 5492 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5493 } 5494 5495 // Compute the scalarization overhead of needed extractelement 5496 // instructions. For each of the instruction's operands, if the operand can 5497 // be scalarized, add it to the worklist; otherwise, account for the 5498 // overhead. 5499 for (Use &U : I->operands()) 5500 if (auto *J = dyn_cast<Instruction>(U.get())) { 5501 assert(VectorType::isValidElementType(J->getType()) && 5502 "Instruction has non-scalar type"); 5503 if (canBeScalarized(J)) 5504 Worklist.push_back(J); 5505 else if (needsExtract(J, VF)) 5506 ScalarCost += TTI.getScalarizationOverhead( 5507 ToVectorTy(J->getType(),VF), false, true); 5508 } 5509 5510 // Scale the total scalar cost by block probability. 5511 ScalarCost /= getReciprocalPredBlockProb(); 5512 5513 // Compute the discount. A non-negative discount means the vector version 5514 // of the instruction costs more, and scalarizing would be beneficial. 5515 Discount += VectorCost - ScalarCost; 5516 ScalarCosts[I] = ScalarCost; 5517 } 5518 5519 return Discount; 5520 } 5521 5522 LoopVectorizationCostModel::VectorizationCostTy 5523 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5524 VectorizationCostTy Cost; 5525 5526 // For each block. 5527 for (BasicBlock *BB : TheLoop->blocks()) { 5528 VectorizationCostTy BlockCost; 5529 5530 // For each instruction in the old loop. 5531 for (Instruction &I : BB->instructionsWithoutDebug()) { 5532 // Skip ignored values. 5533 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5534 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5535 continue; 5536 5537 VectorizationCostTy C = getInstructionCost(&I, VF); 5538 5539 // Check if we should override the cost. 5540 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5541 C.first = ForceTargetInstructionCost; 5542 5543 BlockCost.first += C.first; 5544 BlockCost.second |= C.second; 5545 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5546 << " for VF " << VF << " For instruction: " << I 5547 << '\n'); 5548 } 5549 5550 // If we are vectorizing a predicated block, it will have been 5551 // if-converted. This means that the block's instructions (aside from 5552 // stores and instructions that may divide by zero) will now be 5553 // unconditionally executed. For the scalar case, we may not always execute 5554 // the predicated block. Thus, scale the block's cost by the probability of 5555 // executing it. 5556 if (VF == 1 && blockNeedsPredication(BB)) 5557 BlockCost.first /= getReciprocalPredBlockProb(); 5558 5559 Cost.first += BlockCost.first; 5560 Cost.second |= BlockCost.second; 5561 } 5562 5563 return Cost; 5564 } 5565 5566 /// Gets Address Access SCEV after verifying that the access pattern 5567 /// is loop invariant except the induction variable dependence. 5568 /// 5569 /// This SCEV can be sent to the Target in order to estimate the address 5570 /// calculation cost. 
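///
/// For example (hypothetical IR), a pointer computed as
///   %p = getelementptr [256 x double], [256 x double]* %A, i64 %inv, i64 %iv
/// with %inv loop-invariant and %iv an induction variable satisfies this
/// check, whereas a GEP with an index that is neither loop-invariant nor an
/// induction variable does not, and nullptr is returned for it.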
5571 static const SCEV *getAddressAccessSCEV( 5572 Value *Ptr, 5573 LoopVectorizationLegality *Legal, 5574 PredicatedScalarEvolution &PSE, 5575 const Loop *TheLoop) { 5576 5577 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5578 if (!Gep) 5579 return nullptr; 5580 5581 // We are looking for a gep with all loop invariant indices except for one 5582 // which should be an induction variable. 5583 auto SE = PSE.getSE(); 5584 unsigned NumOperands = Gep->getNumOperands(); 5585 for (unsigned i = 1; i < NumOperands; ++i) { 5586 Value *Opd = Gep->getOperand(i); 5587 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5588 !Legal->isInductionVariable(Opd)) 5589 return nullptr; 5590 } 5591 5592 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5593 return PSE.getSCEV(Ptr); 5594 } 5595 5596 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5597 return Legal->hasStride(I->getOperand(0)) || 5598 Legal->hasStride(I->getOperand(1)); 5599 } 5600 5601 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5602 unsigned VF) { 5603 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5604 Type *ValTy = getMemInstValueType(I); 5605 auto SE = PSE.getSE(); 5606 5607 unsigned Alignment = getLoadStoreAlignment(I); 5608 unsigned AS = getLoadStoreAddressSpace(I); 5609 Value *Ptr = getLoadStorePointerOperand(I); 5610 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5611 5612 // Figure out whether the access is strided and get the stride value 5613 // if it's known in compile time 5614 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5615 5616 // Get the cost of the scalar memory instruction and address computation. 5617 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5618 5619 // Don't pass *I here, since it is scalar but will actually be part of a 5620 // vectorized loop where the user of it is a vectorized instruction. 5621 Cost += VF * 5622 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, 5623 AS); 5624 5625 // Get the overhead of the extractelement and insertelement instructions 5626 // we might create due to scalarization. 5627 Cost += getScalarizationOverhead(I, VF); 5628 5629 // If we have a predicated store, it may not be executed for each vector 5630 // lane. Scale the cost by the probability of executing the predicated 5631 // block. 5632 if (isPredicatedInst(I)) { 5633 Cost /= getReciprocalPredBlockProb(); 5634 5635 if (useEmulatedMaskMemRefHack(I)) 5636 // Artificially setting to a high enough value to practically disable 5637 // vectorization with such operations. 
5638 Cost = 3000000; 5639 } 5640 5641 return Cost; 5642 } 5643 5644 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5645 unsigned VF) { 5646 Type *ValTy = getMemInstValueType(I); 5647 Type *VectorTy = ToVectorTy(ValTy, VF); 5648 unsigned Alignment = getLoadStoreAlignment(I); 5649 Value *Ptr = getLoadStorePointerOperand(I); 5650 unsigned AS = getLoadStoreAddressSpace(I); 5651 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5652 5653 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5654 "Stride should be 1 or -1 for consecutive memory access"); 5655 unsigned Cost = 0; 5656 if (Legal->isMaskRequired(I)) 5657 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); 5658 else 5659 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5660 5661 bool Reverse = ConsecutiveStride < 0; 5662 if (Reverse) 5663 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5664 return Cost; 5665 } 5666 5667 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5668 unsigned VF) { 5669 Type *ValTy = getMemInstValueType(I); 5670 Type *VectorTy = ToVectorTy(ValTy, VF); 5671 unsigned Alignment = getLoadStoreAlignment(I); 5672 unsigned AS = getLoadStoreAddressSpace(I); 5673 if (isa<LoadInst>(I)) { 5674 return TTI.getAddressComputationCost(ValTy) + 5675 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5676 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5677 } 5678 StoreInst *SI = cast<StoreInst>(I); 5679 5680 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5681 return TTI.getAddressComputationCost(ValTy) + 5682 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5683 (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( 5684 Instruction::ExtractElement, 5685 VectorTy, VF - 1)); 5686 } 5687 5688 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5689 unsigned VF) { 5690 Type *ValTy = getMemInstValueType(I); 5691 Type *VectorTy = ToVectorTy(ValTy, VF); 5692 unsigned Alignment = getLoadStoreAlignment(I); 5693 Value *Ptr = getLoadStorePointerOperand(I); 5694 5695 return TTI.getAddressComputationCost(VectorTy) + 5696 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5697 Legal->isMaskRequired(I), Alignment); 5698 } 5699 5700 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5701 unsigned VF) { 5702 Type *ValTy = getMemInstValueType(I); 5703 Type *VectorTy = ToVectorTy(ValTy, VF); 5704 unsigned AS = getLoadStoreAddressSpace(I); 5705 5706 auto Group = getInterleavedAccessGroup(I); 5707 assert(Group && "Fail to get an interleaved access group."); 5708 5709 unsigned InterleaveFactor = Group->getFactor(); 5710 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5711 5712 // Holds the indices of existing members in an interleaved load group. 5713 // An interleaved store group doesn't need this as it doesn't allow gaps. 5714 SmallVector<unsigned, 4> Indices; 5715 if (isa<LoadInst>(I)) { 5716 for (unsigned i = 0; i < InterleaveFactor; i++) 5717 if (Group->getMember(i)) 5718 Indices.push_back(i); 5719 } 5720 5721 // Calculate the cost of the whole interleaved group. 
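// For example, a load group with factor 2 at VF = 4 is costed below as a
// single wide access of 8 elements (VF * factor); for loads, Indices lists
// only the members actually present, so gaps in the group are accounted for.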
5722 bool UseMaskForGaps = 5723 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5724 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5725 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5726 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5727 5728 if (Group->isReverse()) { 5729 // TODO: Add support for reversed masked interleaved access. 5730 assert(!Legal->isMaskRequired(I) && 5731 "Reverse masked interleaved access not supported."); 5732 Cost += Group->getNumMembers() * 5733 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5734 } 5735 return Cost; 5736 } 5737 5738 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5739 unsigned VF) { 5740 // Calculate scalar cost only. Vectorization cost should be ready at this 5741 // moment. 5742 if (VF == 1) { 5743 Type *ValTy = getMemInstValueType(I); 5744 unsigned Alignment = getLoadStoreAlignment(I); 5745 unsigned AS = getLoadStoreAddressSpace(I); 5746 5747 return TTI.getAddressComputationCost(ValTy) + 5748 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5749 } 5750 return getWideningCost(I, VF); 5751 } 5752 5753 LoopVectorizationCostModel::VectorizationCostTy 5754 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5755 // If we know that this instruction will remain uniform, check the cost of 5756 // the scalar version. 5757 if (isUniformAfterVectorization(I, VF)) 5758 VF = 1; 5759 5760 if (VF > 1 && isProfitableToScalarize(I, VF)) 5761 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5762 5763 // Forced scalars do not have any scalarization overhead. 5764 auto ForcedScalar = ForcedScalars.find(VF); 5765 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5766 auto InstSet = ForcedScalar->second; 5767 if (InstSet.find(I) != InstSet.end()) 5768 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5769 } 5770 5771 Type *VectorTy; 5772 unsigned C = getInstructionCost(I, VF, VectorTy); 5773 5774 bool TypeNotScalarized = 5775 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5776 return VectorizationCostTy(C, TypeNotScalarized); 5777 } 5778 5779 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5780 unsigned VF) { 5781 5782 if (VF == 1) 5783 return 0; 5784 5785 unsigned Cost = 0; 5786 Type *RetTy = ToVectorTy(I->getType(), VF); 5787 if (!RetTy->isVoidTy() && 5788 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5789 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5790 5791 // Some targets keep addresses scalar. 5792 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5793 return Cost; 5794 5795 // Some targets support efficient element stores. 5796 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5797 return Cost; 5798 5799 // Collect operands to consider. 5800 CallInst *CI = dyn_cast<CallInst>(I); 5801 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 5802 5803 // Skip operands that do not require extraction/scalarization and do not incur 5804 // any overhead. 5805 return Cost + TTI.getOperandsScalarizationOverhead( 5806 filterExtractingOperands(Ops, VF), VF); 5807 } 5808 5809 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 5810 if (VF == 1) 5811 return; 5812 NumPredStores = 0; 5813 for (BasicBlock *BB : TheLoop->blocks()) { 5814 // For each instruction in the old loop. 
5815 for (Instruction &I : *BB) {
5816 Value *Ptr = getLoadStorePointerOperand(&I);
5817 if (!Ptr)
5818 continue;
5819
5820 // TODO: We should generate better code and update the cost model for
5821 // predicated uniform stores. Today they are treated as any other
5822 // predicated store (see added test cases in
5823 // invariant-store-vectorization.ll).
5824 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5825 NumPredStores++;
5826
5827 if (Legal->isUniform(Ptr) &&
5828 // Conditional loads and stores should be scalarized and predicated.
5829 // isScalarWithPredication cannot be used here since masked
5830 // gather/scatters are not considered scalar with predication.
5831 !Legal->blockNeedsPredication(I.getParent())) {
5832 // TODO: Avoid replicating loads and stores instead of
5833 // relying on instcombine to remove them.
5834 // Load: Scalar load + broadcast
5835 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5836 unsigned Cost = getUniformMemOpCost(&I, VF);
5837 setWideningDecision(&I, VF, CM_Scalarize, Cost);
5838 continue;
5839 }
5840
5841 // We assume that widening is the best solution when possible.
5842 if (memoryInstructionCanBeWidened(&I, VF)) {
5843 unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5844 int ConsecutiveStride =
5845 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5846 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5847 "Expected consecutive stride.");
5848 InstWidening Decision =
5849 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5850 setWideningDecision(&I, VF, Decision, Cost);
5851 continue;
5852 }
5853
5854 // Choose between Interleaving, Gather/Scatter or Scalarization.
5855 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5856 unsigned NumAccesses = 1;
5857 if (isAccessInterleaved(&I)) {
5858 auto Group = getInterleavedAccessGroup(&I);
5859 assert(Group && "Fail to get an interleaved access group.");
5860
5861 // Make one decision for the whole group.
5862 if (getWideningDecision(&I, VF) != CM_Unknown)
5863 continue;
5864
5865 NumAccesses = Group->getNumMembers();
5866 if (interleavedAccessCanBeWidened(&I, VF))
5867 InterleaveCost = getInterleaveGroupCost(&I, VF);
5868 }
5869
5870 unsigned GatherScatterCost =
5871 isLegalGatherOrScatter(&I)
5872 ? getGatherScatterCost(&I, VF) * NumAccesses
5873 : std::numeric_limits<unsigned>::max();
5874
5875 unsigned ScalarizationCost =
5876 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5877
5878 // Choose the best solution for the current VF,
5879 // write down this decision and use it during vectorization.
5880 unsigned Cost;
5881 InstWidening Decision;
5882 if (InterleaveCost <= GatherScatterCost &&
5883 InterleaveCost < ScalarizationCost) {
5884 Decision = CM_Interleave;
5885 Cost = InterleaveCost;
5886 } else if (GatherScatterCost < ScalarizationCost) {
5887 Decision = CM_GatherScatter;
5888 Cost = GatherScatterCost;
5889 } else {
5890 Decision = CM_Scalarize;
5891 Cost = ScalarizationCost;
5892 }
5893 // If the instruction belongs to an interleave group, the whole group
5894 // receives the same decision. The whole group receives the cost, but
5895 // the cost will actually be assigned to one instruction.
5896 if (auto Group = getInterleavedAccessGroup(&I))
5897 setWideningDecision(Group, VF, Decision, Cost);
5898 else
5899 setWideningDecision(&I, VF, Decision, Cost);
5900 }
5901 }
5902
5903 // Make sure that any load of address and any other address computation
5904 // remains scalar unless there is gather/scatter support. This avoids
5905 // inevitable extracts into address registers, and also has the benefit of
5906 // activating LSR more, since that pass can't optimize vectorized
5907 // addresses.
5908 if (TTI.prefersVectorizedAddressing())
5909 return;
5910
5911 // Start with all scalar pointer uses.
5912 SmallPtrSet<Instruction *, 8> AddrDefs;
5913 for (BasicBlock *BB : TheLoop->blocks())
5914 for (Instruction &I : *BB) {
5915 Instruction *PtrDef =
5916 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5917 if (PtrDef && TheLoop->contains(PtrDef) &&
5918 getWideningDecision(&I, VF) != CM_GatherScatter)
5919 AddrDefs.insert(PtrDef);
5920 }
5921
5922 // Add all instructions used to generate the addresses.
5923 SmallVector<Instruction *, 4> Worklist;
5924 for (auto *I : AddrDefs)
5925 Worklist.push_back(I);
5926 while (!Worklist.empty()) {
5927 Instruction *I = Worklist.pop_back_val();
5928 for (auto &Op : I->operands())
5929 if (auto *InstOp = dyn_cast<Instruction>(Op))
5930 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5931 AddrDefs.insert(InstOp).second)
5932 Worklist.push_back(InstOp);
5933 }
5934
5935 for (auto *I : AddrDefs) {
5936 if (isa<LoadInst>(I)) {
5937 // Setting the desired widening decision should ideally be handled
5938 // by cost functions, but since this involves the task of finding out
5939 // if the loaded register is involved in an address computation, it is
5940 // instead changed here when we know this is the case.
5941 InstWidening Decision = getWideningDecision(I, VF);
5942 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5943 // Scalarize a widened load of address.
5944 setWideningDecision(I, VF, CM_Scalarize,
5945 (VF * getMemoryInstructionCost(I, 1)));
5946 else if (auto Group = getInterleavedAccessGroup(I)) {
5947 // Scalarize an interleave group of address loads.
5948 for (unsigned I = 0; I < Group->getFactor(); ++I) {
5949 if (Instruction *Member = Group->getMember(I))
5950 setWideningDecision(Member, VF, CM_Scalarize,
5951 (VF * getMemoryInstructionCost(Member, 1)));
5952 }
5953 }
5954 } else
5955 // Make sure I gets scalarized and is given a cost estimate without
5956 // scalarization overhead.
5957 ForcedScalars[VF].insert(I);
5958 }
5959 }
5960
5961 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5962 unsigned VF,
5963 Type *&VectorTy) {
5964 Type *RetTy = I->getType();
5965 if (canTruncateToMinimalBitwidth(I, VF))
5966 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5967 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5968 auto SE = PSE.getSE();
5969
5970 // TODO: We need to estimate the cost of intrinsic calls.
5971 switch (I->getOpcode()) {
5972 case Instruction::GetElementPtr:
5973 // We mark this instruction as zero-cost because the cost of GEPs in
5974 // vectorized code depends on whether the corresponding memory instruction
5975 // is scalarized or not. Therefore, we handle GEPs with the memory
5976 // instruction cost.
5977 return 0;
5978 case Instruction::Br: {
5979 // In cases of scalarized and predicated instructions, there will be VF
5980 // predicated blocks in the vectorized loop. Each branch around these
5981 // blocks also requires an extract of its vector compare i1 element.
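// For example, at VF = 4 the cost returned below for such a branch is the
// extraction overhead of a <4 x i1> compare value plus four scalar branches.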
5982 bool ScalarPredicatedBB = false; 5983 BranchInst *BI = cast<BranchInst>(I); 5984 if (VF > 1 && BI->isConditional() && 5985 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 5986 PredicatedBBsAfterVectorization.end() || 5987 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 5988 PredicatedBBsAfterVectorization.end())) 5989 ScalarPredicatedBB = true; 5990 5991 if (ScalarPredicatedBB) { 5992 // Return cost for branches around scalarized and predicated blocks. 5993 Type *Vec_i1Ty = 5994 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 5995 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 5996 (TTI.getCFInstrCost(Instruction::Br) * VF)); 5997 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 5998 // The back-edge branch will remain, as will all scalar branches. 5999 return TTI.getCFInstrCost(Instruction::Br); 6000 else 6001 // This branch will be eliminated by if-conversion. 6002 return 0; 6003 // Note: We currently assume zero cost for an unconditional branch inside 6004 // a predicated block since it will become a fall-through, although we 6005 // may decide in the future to call TTI for all branches. 6006 } 6007 case Instruction::PHI: { 6008 auto *Phi = cast<PHINode>(I); 6009 6010 // First-order recurrences are replaced by vector shuffles inside the loop. 6011 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6012 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6013 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6014 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6015 6016 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6017 // converted into select instructions. We require N - 1 selects per phi 6018 // node, where N is the number of incoming values. 6019 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6020 return (Phi->getNumIncomingValues() - 1) * 6021 TTI.getCmpSelInstrCost( 6022 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6023 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6024 6025 return TTI.getCFInstrCost(Instruction::PHI); 6026 } 6027 case Instruction::UDiv: 6028 case Instruction::SDiv: 6029 case Instruction::URem: 6030 case Instruction::SRem: 6031 // If we have a predicated instruction, it may not be executed for each 6032 // vector lane. Get the scalarization cost and scale this amount by the 6033 // probability of executing the predicated block. If the instruction is not 6034 // predicated, we fall through to the next case. 6035 if (VF > 1 && isScalarWithPredication(I)) { 6036 unsigned Cost = 0; 6037 6038 // These instructions have a non-void type, so account for the phi nodes 6039 // that we will create. This cost is likely to be zero. The phi node 6040 // cost, if any, should be scaled by the block probability because it 6041 // models a copy at the end of each predicated block. 6042 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6043 6044 // The cost of the non-predicated instruction. 6045 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6046 6047 // The cost of insertelement and extractelement instructions needed for 6048 // scalarization. 6049 Cost += getScalarizationOverhead(I, VF); 6050 6051 // Scale the cost by the probability of executing the predicated blocks. 6052 // This assumes the predicated block for each vector lane is equally 6053 // likely. 
6054 return Cost / getReciprocalPredBlockProb(); 6055 } 6056 LLVM_FALLTHROUGH; 6057 case Instruction::Add: 6058 case Instruction::FAdd: 6059 case Instruction::Sub: 6060 case Instruction::FSub: 6061 case Instruction::Mul: 6062 case Instruction::FMul: 6063 case Instruction::FDiv: 6064 case Instruction::FRem: 6065 case Instruction::Shl: 6066 case Instruction::LShr: 6067 case Instruction::AShr: 6068 case Instruction::And: 6069 case Instruction::Or: 6070 case Instruction::Xor: { 6071 // Since we will replace the stride by 1 the multiplication should go away. 6072 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6073 return 0; 6074 // Certain instructions can be cheaper to vectorize if they have a constant 6075 // second vector operand. One example of this are shifts on x86. 6076 Value *Op2 = I->getOperand(1); 6077 TargetTransformInfo::OperandValueProperties Op2VP; 6078 TargetTransformInfo::OperandValueKind Op2VK = 6079 TTI.getOperandInfo(Op2, Op2VP); 6080 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6081 Op2VK = TargetTransformInfo::OK_UniformValue; 6082 6083 SmallVector<const Value *, 4> Operands(I->operand_values()); 6084 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6085 return N * TTI.getArithmeticInstrCost( 6086 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6087 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); 6088 } 6089 case Instruction::FNeg: { 6090 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6091 return N * TTI.getArithmeticInstrCost( 6092 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6093 TargetTransformInfo::OK_AnyValue, 6094 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6095 I->getOperand(0)); 6096 } 6097 case Instruction::Select: { 6098 SelectInst *SI = cast<SelectInst>(I); 6099 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6100 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6101 Type *CondTy = SI->getCondition()->getType(); 6102 if (!ScalarCond) 6103 CondTy = VectorType::get(CondTy, VF); 6104 6105 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6106 } 6107 case Instruction::ICmp: 6108 case Instruction::FCmp: { 6109 Type *ValTy = I->getOperand(0)->getType(); 6110 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6111 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6112 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6113 VectorTy = ToVectorTy(ValTy, VF); 6114 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6115 } 6116 case Instruction::Store: 6117 case Instruction::Load: { 6118 unsigned Width = VF; 6119 if (Width > 1) { 6120 InstWidening Decision = getWideningDecision(I, Width); 6121 assert(Decision != CM_Unknown && 6122 "CM decision should be taken at this point"); 6123 if (Decision == CM_Scalarize) 6124 Width = 1; 6125 } 6126 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6127 return getMemoryInstructionCost(I, VF); 6128 } 6129 case Instruction::ZExt: 6130 case Instruction::SExt: 6131 case Instruction::FPToUI: 6132 case Instruction::FPToSI: 6133 case Instruction::FPExt: 6134 case Instruction::PtrToInt: 6135 case Instruction::IntToPtr: 6136 case Instruction::SIToFP: 6137 case Instruction::UIToFP: 6138 case Instruction::Trunc: 6139 case Instruction::FPTrunc: 6140 case Instruction::BitCast: { 6141 // We optimize the truncation of induction variables having constant 6142 // integer steps. 
The cost of these truncations is the same as the scalar 6143 // operation. 6144 if (isOptimizableIVTruncate(I, VF)) { 6145 auto *Trunc = cast<TruncInst>(I); 6146 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6147 Trunc->getSrcTy(), Trunc); 6148 } 6149 6150 Type *SrcScalarTy = I->getOperand(0)->getType(); 6151 Type *SrcVecTy = 6152 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6153 if (canTruncateToMinimalBitwidth(I, VF)) { 6154 // This cast is going to be shrunk. This may remove the cast or it might 6155 // turn it into slightly different cast. For example, if MinBW == 16, 6156 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6157 // 6158 // Calculate the modified src and dest types. 6159 Type *MinVecTy = VectorTy; 6160 if (I->getOpcode() == Instruction::Trunc) { 6161 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6162 VectorTy = 6163 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6164 } else if (I->getOpcode() == Instruction::ZExt || 6165 I->getOpcode() == Instruction::SExt) { 6166 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6167 VectorTy = 6168 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6169 } 6170 } 6171 6172 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6173 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6174 } 6175 case Instruction::Call: { 6176 bool NeedToScalarize; 6177 CallInst *CI = cast<CallInst>(I); 6178 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6179 if (getVectorIntrinsicIDForCall(CI, TLI)) 6180 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6181 return CallCost; 6182 } 6183 default: 6184 // The cost of executing VF copies of the scalar instruction. This opcode 6185 // is unknown. Assume that it is the same as 'mul'. 6186 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6187 getScalarizationOverhead(I, VF); 6188 } // end of switch. 6189 } 6190 6191 char LoopVectorize::ID = 0; 6192 6193 static const char lv_name[] = "Loop Vectorization"; 6194 6195 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6196 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6197 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6198 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6199 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6200 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6201 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6202 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6203 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6204 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6205 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6206 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6207 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6208 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6209 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6210 6211 namespace llvm { 6212 6213 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6214 6215 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6216 bool VectorizeOnlyWhenForced) { 6217 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6218 } 6219 6220 } // end namespace llvm 6221 6222 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6223 // Check if the pointer operand of a load or store instruction is 6224 // consecutive. 
6225 if (auto *Ptr = getLoadStorePointerOperand(Inst))
6226 return Legal->isConsecutivePtr(Ptr);
6227 return false;
6228 }
6229
6230 void LoopVectorizationCostModel::collectValuesToIgnore() {
6231 // Ignore ephemeral values.
6232 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6233
6234 // Ignore type-promoting instructions we identified during reduction
6235 // detection.
6236 for (auto &Reduction : *Legal->getReductionVars()) {
6237 RecurrenceDescriptor &RedDes = Reduction.second;
6238 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6239 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6240 }
6241 // Ignore type-casting instructions we identified during induction
6242 // detection.
6243 for (auto &Induction : *Legal->getInductionVars()) {
6244 InductionDescriptor &IndDes = Induction.second;
6245 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6246 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6247 }
6248 }
6249
6250 // TODO: we could return a pair of values that specify the max VF and
6251 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6252 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6253 // doesn't have a cost model that can choose which plan to execute if
6254 // more than one is generated.
6255 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6256 LoopVectorizationCostModel &CM) {
6257 unsigned WidestType;
6258 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6259 return WidestVectorRegBits / WidestType;
6260 }
6261
6262 VectorizationFactor
6263 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6264 unsigned VF = UserVF;
6265 // Outer loop handling: They may require CFG and instruction level
6266 // transformations before even evaluating whether vectorization is profitable.
6267 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6268 // the vectorization pipeline.
6269 if (!OrigLoop->empty()) {
6270 // If the user doesn't provide a vectorization factor, determine a
6271 // reasonable one.
6272 if (!UserVF) {
6273 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6274 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6275
6276 // Make sure we have a VF > 1 for stress testing.
6277 if (VPlanBuildStressTest && VF < 2) {
6278 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6279 << "overriding computed VF.\n");
6280 VF = 4;
6281 }
6282 }
6283 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6284 assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6285 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6286 << " to build VPlans.\n");
6287 buildVPlans(VF, VF);
6288
6289 // For VPlan build stress testing, we bail out after VPlan construction.
6290 if (VPlanBuildStressTest)
6291 return VectorizationFactor::Disabled();
6292
6293 return {VF, 0};
6294 }
6295
6296 LLVM_DEBUG(
6297 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6298 "VPlan-native path.\n");
6299 return VectorizationFactor::Disabled();
6300 }
6301
6302 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6303 assert(OrigLoop->empty() && "Inner loop expected.");
6304 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6305 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6306 return None;
6307
6308 // Invalidate interleave groups if all blocks of the loop will be predicated.
6309 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6310 !useMaskedInterleavedAccesses(*TTI)) { 6311 LLVM_DEBUG( 6312 dbgs() 6313 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6314 "which requires masked-interleaved support.\n"); 6315 CM.InterleaveInfo.reset(); 6316 } 6317 6318 if (UserVF) { 6319 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6320 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6321 // Collect the instructions (and their associated costs) that will be more 6322 // profitable to scalarize. 6323 CM.selectUserVectorizationFactor(UserVF); 6324 buildVPlansWithVPRecipes(UserVF, UserVF); 6325 LLVM_DEBUG(printPlans(dbgs())); 6326 return {{UserVF, 0}}; 6327 } 6328 6329 unsigned MaxVF = MaybeMaxVF.getValue(); 6330 assert(MaxVF != 0 && "MaxVF is zero."); 6331 6332 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6333 // Collect Uniform and Scalar instructions after vectorization with VF. 6334 CM.collectUniformsAndScalars(VF); 6335 6336 // Collect the instructions (and their associated costs) that will be more 6337 // profitable to scalarize. 6338 if (VF > 1) 6339 CM.collectInstsToScalarize(VF); 6340 } 6341 6342 buildVPlansWithVPRecipes(1, MaxVF); 6343 LLVM_DEBUG(printPlans(dbgs())); 6344 if (MaxVF == 1) 6345 return VectorizationFactor::Disabled(); 6346 6347 // Select the optimal vectorization factor. 6348 return CM.selectVectorizationFactor(MaxVF); 6349 } 6350 6351 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6352 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6353 << '\n'); 6354 BestVF = VF; 6355 BestUF = UF; 6356 6357 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6358 return !Plan->hasVF(VF); 6359 }); 6360 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6361 } 6362 6363 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6364 DominatorTree *DT) { 6365 // Perform the actual loop transformation. 6366 6367 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6368 VPCallbackILV CallbackILV(ILV); 6369 6370 VPTransformState State{BestVF, BestUF, LI, 6371 DT, ILV.Builder, ILV.VectorLoopValueMap, 6372 &ILV, CallbackILV}; 6373 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6374 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6375 6376 //===------------------------------------------------===// 6377 // 6378 // Notice: any optimization or new instruction that go 6379 // into the code below should also be implemented in 6380 // the cost-model. 6381 // 6382 //===------------------------------------------------===// 6383 6384 // 2. Copy and widen instructions from the old loop into the new loop. 6385 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6386 VPlans.front()->execute(&State); 6387 6388 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6389 // predication, updating analyses. 6390 ILV.fixVectorizedLoop(); 6391 } 6392 6393 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6394 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6395 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6396 6397 // We create new control-flow for the vectorized loop, so the original 6398 // condition will be dead after vectorization if it's only used by the 6399 // branch. 
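// For instance (hypothetical IR), a latch compare such as
//   %exitcond = icmp eq i64 %iv.next, %n
// whose only use is the latch branch is recorded as dead here.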
6400 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6401 if (Cmp && Cmp->hasOneUse()) 6402 DeadInstructions.insert(Cmp); 6403 6404 // We create new "steps" for induction variable updates to which the original 6405 // induction variables map. An original update instruction will be dead if 6406 // all its users except the induction variable are dead. 6407 for (auto &Induction : *Legal->getInductionVars()) { 6408 PHINode *Ind = Induction.first; 6409 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6410 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6411 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6412 DeadInstructions.end(); 6413 })) 6414 DeadInstructions.insert(IndUpdate); 6415 6416 // We record as "Dead" also the type-casting instructions we had identified 6417 // during induction analysis. We don't need any handling for them in the 6418 // vectorized loop because we have proven that, under a proper runtime 6419 // test guarding the vectorized loop, the value of the phi, and the casted 6420 // value of the phi, are the same. The last instruction in this casting chain 6421 // will get its scalar/vector/widened def from the scalar/vector/widened def 6422 // of the respective phi node. Any other casts in the induction def-use chain 6423 // have no other uses outside the phi update chain, and will be ignored. 6424 InductionDescriptor &IndDes = Induction.second; 6425 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6426 DeadInstructions.insert(Casts.begin(), Casts.end()); 6427 } 6428 } 6429 6430 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6431 6432 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6433 6434 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6435 Instruction::BinaryOps BinOp) { 6436 // When unrolling and the VF is 1, we only need to add a simple scalar. 6437 Type *Ty = Val->getType(); 6438 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6439 6440 if (Ty->isFloatingPointTy()) { 6441 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6442 6443 // Floating point operations had to be 'fast' to enable the unrolling. 6444 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6445 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6446 } 6447 Constant *C = ConstantInt::get(Ty, StartIdx); 6448 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6449 } 6450 6451 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6452 SmallVector<Metadata *, 4> MDs; 6453 // Reserve first location for self reference to the LoopID metadata node. 6454 MDs.push_back(nullptr); 6455 bool IsUnrollMetadata = false; 6456 MDNode *LoopID = L->getLoopID(); 6457 if (LoopID) { 6458 // First find existing loop unrolling disable metadata. 6459 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6460 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6461 if (MD) { 6462 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6463 IsUnrollMetadata = 6464 S && S->getString().startswith("llvm.loop.unroll.disable"); 6465 } 6466 MDs.push_back(LoopID->getOperand(i)); 6467 } 6468 } 6469 6470 if (!IsUnrollMetadata) { 6471 // Add runtime unroll disable metadata. 
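// That is, an operand of the form !{!"llvm.loop.unroll.runtime.disable"} is
// appended to the loop ID metadata node, and operand 0 of the new node is
// updated to refer to the node itself.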
6472 LLVMContext &Context = L->getHeader()->getContext(); 6473 SmallVector<Metadata *, 1> DisableOperands; 6474 DisableOperands.push_back( 6475 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6476 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6477 MDs.push_back(DisableNode); 6478 MDNode *NewLoopID = MDNode::get(Context, MDs); 6479 // Set operand 0 to refer to the loop id itself. 6480 NewLoopID->replaceOperandWith(0, NewLoopID); 6481 L->setLoopID(NewLoopID); 6482 } 6483 } 6484 6485 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6486 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6487 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6488 bool PredicateAtRangeStart = Predicate(Range.Start); 6489 6490 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6491 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6492 Range.End = TmpVF; 6493 break; 6494 } 6495 6496 return PredicateAtRangeStart; 6497 } 6498 6499 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6500 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6501 /// of VF's starting at a given VF and extending it as much as possible. Each 6502 /// vectorization decision can potentially shorten this sub-range during 6503 /// buildVPlan(). 6504 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6505 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6506 VFRange SubRange = {VF, MaxVF + 1}; 6507 VPlans.push_back(buildVPlan(SubRange)); 6508 VF = SubRange.End; 6509 } 6510 } 6511 6512 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6513 VPlanPtr &Plan) { 6514 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6515 6516 // Look for cached value. 6517 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6518 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6519 if (ECEntryIt != EdgeMaskCache.end()) 6520 return ECEntryIt->second; 6521 6522 VPValue *SrcMask = createBlockInMask(Src, Plan); 6523 6524 // The terminator has to be a branch inst! 6525 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6526 assert(BI && "Unexpected terminator found"); 6527 6528 if (!BI->isConditional()) 6529 return EdgeMaskCache[Edge] = SrcMask; 6530 6531 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6532 assert(EdgeMask && "No Edge Mask found for condition"); 6533 6534 if (BI->getSuccessor(0) != Dst) 6535 EdgeMask = Builder.createNot(EdgeMask); 6536 6537 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6538 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6539 6540 return EdgeMaskCache[Edge] = EdgeMask; 6541 } 6542 6543 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6544 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6545 6546 // Look for cached value. 6547 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6548 if (BCEntryIt != BlockMaskCache.end()) 6549 return BCEntryIt->second; 6550 6551 // All-one mask is modelled as no-mask following the convention for masked 6552 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6553 VPValue *BlockMask = nullptr; 6554 6555 if (OrigLoop->getHeader() == BB) { 6556 if (!CM.blockNeedsPredication(BB)) 6557 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6558 6559 // Introduce the early-exit compare IV <= BTC to form header block mask. 
6560 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6561 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6562 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6563 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6564 return BlockMaskCache[BB] = BlockMask; 6565 } 6566 6567 // This is the block mask. We OR all incoming edges. 6568 for (auto *Predecessor : predecessors(BB)) { 6569 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6570 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6571 return BlockMaskCache[BB] = EdgeMask; 6572 6573 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6574 BlockMask = EdgeMask; 6575 continue; 6576 } 6577 6578 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6579 } 6580 6581 return BlockMaskCache[BB] = BlockMask; 6582 } 6583 6584 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, 6585 VFRange &Range, 6586 VPlanPtr &Plan) { 6587 const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); 6588 if (!IG) 6589 return nullptr; 6590 6591 // Now check if IG is relevant for VF's in the given range. 6592 auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { 6593 return [=](unsigned VF) -> bool { 6594 return (VF >= 2 && // Query is illegal for VF == 1 6595 CM.getWideningDecision(I, VF) == 6596 LoopVectorizationCostModel::CM_Interleave); 6597 }; 6598 }; 6599 if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) 6600 return nullptr; 6601 6602 // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) 6603 // range. If it's the primary member of the IG construct a VPInterleaveRecipe. 6604 // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. 6605 assert(I == IG->getInsertPos() && 6606 "Generating a recipe for an adjunct member of an interleave group"); 6607 6608 VPValue *Mask = nullptr; 6609 if (Legal->isMaskRequired(I)) 6610 Mask = createBlockInMask(I->getParent(), Plan); 6611 6612 return new VPInterleaveRecipe(IG, Mask); 6613 } 6614 6615 VPWidenMemoryInstructionRecipe * 6616 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6617 VPlanPtr &Plan) { 6618 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6619 return nullptr; 6620 6621 auto willWiden = [&](unsigned VF) -> bool { 6622 if (VF == 1) 6623 return false; 6624 if (CM.isScalarAfterVectorization(I, VF) || 6625 CM.isProfitableToScalarize(I, VF)) 6626 return false; 6627 LoopVectorizationCostModel::InstWidening Decision = 6628 CM.getWideningDecision(I, VF); 6629 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6630 "CM decision should be taken at this point."); 6631 assert(Decision != LoopVectorizationCostModel::CM_Interleave && 6632 "Interleave memory opportunity should be caught earlier."); 6633 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6634 }; 6635 6636 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6637 return nullptr; 6638 6639 VPValue *Mask = nullptr; 6640 if (Legal->isMaskRequired(I)) 6641 Mask = createBlockInMask(I->getParent(), Plan); 6642 6643 return new VPWidenMemoryInstructionRecipe(*I, Mask); 6644 } 6645 6646 VPWidenIntOrFpInductionRecipe * 6647 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6648 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6649 // Check if this is an integer or fp induction. If so, build the recipe that 6650 // produces its scalar and vector values. 
6651 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); 6652 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6653 II.getKind() == InductionDescriptor::IK_FpInduction) 6654 return new VPWidenIntOrFpInductionRecipe(Phi); 6655 6656 return nullptr; 6657 } 6658 6659 // Optimize the special case where the source is a constant integer 6660 // induction variable. Notice that we can only optimize the 'trunc' case 6661 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6662 // (c) other casts depend on pointer size. 6663 6664 // Determine whether \p K is a truncation based on an induction variable that 6665 // can be optimized. 6666 auto isOptimizableIVTruncate = 6667 [&](Instruction *K) -> std::function<bool(unsigned)> { 6668 return 6669 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6670 }; 6671 6672 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6673 isOptimizableIVTruncate(I), Range)) 6674 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6675 cast<TruncInst>(I)); 6676 return nullptr; 6677 } 6678 6679 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6680 PHINode *Phi = dyn_cast<PHINode>(I); 6681 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6682 return nullptr; 6683 6684 // We know that all PHIs in non-header blocks are converted into selects, so 6685 // we don't have to worry about the insertion order and we can just use the 6686 // builder. At this point we generate the predication tree. There may be 6687 // duplications since this is a simple recursive scan, but future 6688 // optimizations will clean it up. 6689 6690 SmallVector<VPValue *, 2> Masks; 6691 unsigned NumIncoming = Phi->getNumIncomingValues(); 6692 for (unsigned In = 0; In < NumIncoming; In++) { 6693 VPValue *EdgeMask = 6694 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6695 assert((EdgeMask || NumIncoming == 1) && 6696 "Multiple predecessors with one having a full mask"); 6697 if (EdgeMask) 6698 Masks.push_back(EdgeMask); 6699 } 6700 return new VPBlendRecipe(Phi, Masks); 6701 } 6702 6703 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6704 VFRange &Range) { 6705 6706 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6707 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6708 6709 if (IsPredicated) 6710 return false; 6711 6712 auto IsVectorizableOpcode = [](unsigned Opcode) { 6713 switch (Opcode) { 6714 case Instruction::Add: 6715 case Instruction::And: 6716 case Instruction::AShr: 6717 case Instruction::BitCast: 6718 case Instruction::Br: 6719 case Instruction::Call: 6720 case Instruction::FAdd: 6721 case Instruction::FCmp: 6722 case Instruction::FDiv: 6723 case Instruction::FMul: 6724 case Instruction::FNeg: 6725 case Instruction::FPExt: 6726 case Instruction::FPToSI: 6727 case Instruction::FPToUI: 6728 case Instruction::FPTrunc: 6729 case Instruction::FRem: 6730 case Instruction::FSub: 6731 case Instruction::GetElementPtr: 6732 case Instruction::ICmp: 6733 case Instruction::IntToPtr: 6734 case Instruction::Load: 6735 case Instruction::LShr: 6736 case Instruction::Mul: 6737 case Instruction::Or: 6738 case Instruction::PHI: 6739 case Instruction::PtrToInt: 6740 case Instruction::SDiv: 6741 case Instruction::Select: 6742 case Instruction::SExt: 6743 case Instruction::Shl: 6744 case Instruction::SIToFP: 6745 case Instruction::SRem: 6746 case Instruction::Store: 6747 case 
Instruction::Sub: 6748 case Instruction::Trunc: 6749 case Instruction::UDiv: 6750 case Instruction::UIToFP: 6751 case Instruction::URem: 6752 case Instruction::Xor: 6753 case Instruction::ZExt: 6754 return true; 6755 } 6756 return false; 6757 }; 6758 6759 if (!IsVectorizableOpcode(I->getOpcode())) 6760 return false; 6761 6762 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6763 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6764 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6765 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6766 return false; 6767 } 6768 6769 auto willWiden = [&](unsigned VF) -> bool { 6770 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6771 CM.isProfitableToScalarize(I, VF))) 6772 return false; 6773 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6774 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6775 // The following case may be scalarized depending on the VF. 6776 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6777 // version of the instruction. 6778 // Is it beneficial to perform intrinsic call compared to lib call? 6779 bool NeedToScalarize; 6780 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6781 bool UseVectorIntrinsic = 6782 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6783 return UseVectorIntrinsic || !NeedToScalarize; 6784 } 6785 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6786 assert(CM.getWideningDecision(I, VF) == 6787 LoopVectorizationCostModel::CM_Scalarize && 6788 "Memory widening decisions should have been taken care by now"); 6789 return false; 6790 } 6791 return true; 6792 }; 6793 6794 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6795 return false; 6796 6797 // Success: widen this instruction. We optimize the common case where 6798 // consecutive instructions can be represented by a single recipe. 6799 if (!VPBB->empty()) { 6800 VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); 6801 if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) 6802 return true; 6803 } 6804 6805 VPBB->appendRecipe(new VPWidenRecipe(I)); 6806 return true; 6807 } 6808 6809 VPBasicBlock *VPRecipeBuilder::handleReplication( 6810 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 6811 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 6812 VPlanPtr &Plan) { 6813 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 6814 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 6815 Range); 6816 6817 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6818 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6819 6820 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 6821 6822 // Find if I uses a predicated instruction. If so, it will use its scalar 6823 // value. Avoid hoisting the insert-element which packs the scalar value into 6824 // a vector value, as that happens iff all users use the vector value. 6825 for (auto &Op : I->operands()) 6826 if (auto *PredInst = dyn_cast<Instruction>(Op)) 6827 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 6828 PredInst2Recipe[PredInst]->setAlsoPack(false); 6829 6830 // Finalize the recipe for Instr, first if it is not predicated. 
6831 if (!IsPredicated) { 6832 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 6833 VPBB->appendRecipe(Recipe); 6834 return VPBB; 6835 } 6836 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 6837 assert(VPBB->getSuccessors().empty() && 6838 "VPBB has successors when handling predicated replication."); 6839 // Record predicated instructions for above packing optimizations. 6840 PredInst2Recipe[I] = Recipe; 6841 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 6842 VPBlockUtils::insertBlockAfter(Region, VPBB); 6843 auto *RegSucc = new VPBasicBlock(); 6844 VPBlockUtils::insertBlockAfter(RegSucc, Region); 6845 return RegSucc; 6846 } 6847 6848 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 6849 VPRecipeBase *PredRecipe, 6850 VPlanPtr &Plan) { 6851 // Instructions marked for predication are replicated and placed under an 6852 // if-then construct to prevent side-effects. 6853 6854 // Generate recipes to compute the block mask for this region. 6855 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 6856 6857 // Build the triangular if-then region. 6858 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 6859 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 6860 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 6861 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 6862 auto *PHIRecipe = 6863 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 6864 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 6865 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 6866 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 6867 6868 // Note: first set Entry as region entry and then connect successors starting 6869 // from it in order, to propagate the "parent" of each VPBasicBlock. 6870 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 6871 VPBlockUtils::connectBlocks(Pred, Exit); 6872 6873 return Region; 6874 } 6875 6876 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 6877 VPlanPtr &Plan, VPBasicBlock *VPBB) { 6878 VPRecipeBase *Recipe = nullptr; 6879 // Check if Instr should belong to an interleave memory recipe, or already 6880 // does. In the latter case Instr is irrelevant. 6881 if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { 6882 VPBB->appendRecipe(Recipe); 6883 return true; 6884 } 6885 6886 // Check if Instr is a memory operation that should be widened. 6887 if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { 6888 VPBB->appendRecipe(Recipe); 6889 return true; 6890 } 6891 6892 // Check if Instr should form some PHI recipe. 6893 if ((Recipe = tryToOptimizeInduction(Instr, Range))) { 6894 VPBB->appendRecipe(Recipe); 6895 return true; 6896 } 6897 if ((Recipe = tryToBlend(Instr, Plan))) { 6898 VPBB->appendRecipe(Recipe); 6899 return true; 6900 } 6901 if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { 6902 VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); 6903 return true; 6904 } 6905 6906 // Check if Instr is to be widened by a general VPWidenRecipe, after 6907 // having first checked for specific widening recipes that deal with 6908 // Interleave Groups, Inductions and Phi nodes. 
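  // If general widening fails as well, the caller (buildVPlanWithVPRecipes)
  // falls back to handleReplication() and replicates Instr instead.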
6909 if (tryToWiden(Instr, VPBB, Range)) 6910 return true; 6911 6912 return false; 6913 } 6914 6915 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 6916 unsigned MaxVF) { 6917 assert(OrigLoop->empty() && "Inner loop expected."); 6918 6919 // Collect conditions feeding internal conditional branches; they need to be 6920 // represented in VPlan for it to model masking. 6921 SmallPtrSet<Value *, 1> NeedDef; 6922 6923 auto *Latch = OrigLoop->getLoopLatch(); 6924 for (BasicBlock *BB : OrigLoop->blocks()) { 6925 if (BB == Latch) 6926 continue; 6927 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 6928 if (Branch && Branch->isConditional()) 6929 NeedDef.insert(Branch->getCondition()); 6930 } 6931 6932 // If the tail is to be folded by masking, the primary induction variable 6933 // needs to be represented in VPlan for it to model early-exit masking. 6934 if (CM.foldTailByMasking()) 6935 NeedDef.insert(Legal->getPrimaryInduction()); 6936 6937 // Collect instructions from the original loop that will become trivially dead 6938 // in the vectorized loop. We don't need to vectorize these instructions. For 6939 // example, original induction update instructions can become dead because we 6940 // separately emit induction "steps" when generating code for the new loop. 6941 // Similarly, we create a new latch condition when setting up the structure 6942 // of the new loop, so the old one can become dead. 6943 SmallPtrSet<Instruction *, 4> DeadInstructions; 6944 collectTriviallyDeadInstructions(DeadInstructions); 6945 6946 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6947 VFRange SubRange = {VF, MaxVF + 1}; 6948 VPlans.push_back( 6949 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); 6950 VF = SubRange.End; 6951 } 6952 } 6953 6954 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 6955 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 6956 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6957 // Hold a mapping from predicated instructions to their recipes, in order to 6958 // fix their AlsoPack behavior if a user is determined to replicate and use a 6959 // scalar instead of vector value. 6960 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 6961 6962 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 6963 DenseMap<Instruction *, Instruction *> SinkAfterInverse; 6964 6965 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 6966 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 6967 auto Plan = llvm::make_unique<VPlan>(VPBB); 6968 6969 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); 6970 // Represent values that will have defs inside VPlan. 6971 for (Value *V : NeedDef) 6972 Plan->addVPValue(V); 6973 6974 // Scan the body of the loop in a topological order to visit each basic block 6975 // after having visited its predecessor basic blocks. 6976 LoopBlocksDFS DFS(OrigLoop); 6977 DFS.perform(LI); 6978 6979 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 6980 // Relevant instructions from basic block BB will be grouped into VPRecipe 6981 // ingredients and fill a new VPBasicBlock. 6982 unsigned VPBBsForBB = 0; 6983 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 6984 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 6985 VPBB = FirstVPBBForBB; 6986 Builder.setInsertPoint(VPBB); 6987 6988 std::vector<Instruction *> Ingredients; 6989 6990 // Organize the ingredients to vectorize from current basic block in the 6991 // right order. 
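    // Note (illustrative summary of the loop below): Legal->getSinkAfter()
    // records "sink Instr after sink-point" pairs needed to vectorize
    // first-order recurrences. Such an Instr is deferred when first
    // encountered and re-inserted into Ingredients right after its
    // sink-point, so recipes are created in the order the vectorized code
    // requires.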
6992 for (Instruction &I : BB->instructionsWithoutDebug()) { 6993 Instruction *Instr = &I; 6994 6995 // First filter out irrelevant instructions, to ensure no recipes are 6996 // built for them. 6997 if (isa<BranchInst>(Instr) || 6998 DeadInstructions.find(Instr) != DeadInstructions.end()) 6999 continue; 7000 7001 // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct 7002 // member of the IG, do not construct any Recipe for it. 7003 const InterleaveGroup<Instruction> *IG = 7004 CM.getInterleavedAccessGroup(Instr); 7005 if (IG && Instr != IG->getInsertPos() && 7006 Range.Start >= 2 && // Query is illegal for VF == 1 7007 CM.getWideningDecision(Instr, Range.Start) == 7008 LoopVectorizationCostModel::CM_Interleave) { 7009 auto SinkCandidate = SinkAfterInverse.find(Instr); 7010 if (SinkCandidate != SinkAfterInverse.end()) 7011 Ingredients.push_back(SinkCandidate->second); 7012 continue; 7013 } 7014 7015 // Move instructions to handle first-order recurrences, step 1: avoid 7016 // handling this instruction until after we've handled the instruction it 7017 // should follow. 7018 auto SAIt = SinkAfter.find(Instr); 7019 if (SAIt != SinkAfter.end()) { 7020 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" 7021 << *SAIt->second 7022 << " to vectorize a 1st order recurrence.\n"); 7023 SinkAfterInverse[SAIt->second] = Instr; 7024 continue; 7025 } 7026 7027 Ingredients.push_back(Instr); 7028 7029 // Move instructions to handle first-order recurrences, step 2: push the 7030 // instruction to be sunk at its insertion point. 7031 auto SAInvIt = SinkAfterInverse.find(Instr); 7032 if (SAInvIt != SinkAfterInverse.end()) 7033 Ingredients.push_back(SAInvIt->second); 7034 } 7035 7036 // Introduce each ingredient into VPlan. 7037 for (Instruction *Instr : Ingredients) { 7038 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7039 continue; 7040 7041 // Otherwise, if all widening options failed, Instruction is to be 7042 // replicated. This may create a successor for VPBB. 7043 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7044 Instr, Range, VPBB, PredInst2Recipe, Plan); 7045 if (NextVPBB != VPBB) { 7046 VPBB = NextVPBB; 7047 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7048 : ""); 7049 } 7050 } 7051 } 7052 7053 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7054 // may also be empty, such as the last one VPBB, reflecting original 7055 // basic-blocks with no recipes. 7056 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7057 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7058 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7059 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7060 delete PreEntry; 7061 7062 std::string PlanName; 7063 raw_string_ostream RSO(PlanName); 7064 unsigned VF = Range.Start; 7065 Plan->addVF(VF); 7066 RSO << "Initial VPlan for VF={" << VF; 7067 for (VF *= 2; VF < Range.End; VF *= 2) { 7068 Plan->addVF(VF); 7069 RSO << "," << VF; 7070 } 7071 RSO << "},UF>=1"; 7072 RSO.flush(); 7073 Plan->setName(PlanName); 7074 7075 return Plan; 7076 } 7077 7078 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7079 // Outer loop handling: They may require CFG and instruction level 7080 // transformations before even evaluating whether vectorization is profitable. 7081 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7082 // the vectorization pipeline. 
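  // Rough flow of the code below: build the hierarchical CFG for the loop
  // nest, record every candidate VF in the plan, optionally run the
  // VPlanPredicator, and (when predication is off) lower the plan's
  // VPInstructions to VPRecipes.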
7083 assert(!OrigLoop->empty()); 7084 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7085 7086 // Create new empty VPlan 7087 auto Plan = llvm::make_unique<VPlan>(); 7088 7089 // Build hierarchical CFG 7090 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7091 HCFGBuilder.buildHierarchicalCFG(); 7092 7093 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7094 Plan->addVF(VF); 7095 7096 if (EnableVPlanPredication) { 7097 VPlanPredicator VPP(*Plan); 7098 VPP.predicate(); 7099 7100 // Avoid running transformation to recipes until masked code generation in 7101 // VPlan-native path is in place. 7102 return Plan; 7103 } 7104 7105 SmallPtrSet<Instruction *, 1> DeadInstructions; 7106 VPlanHCFGTransforms::VPInstructionsToVPRecipes( 7107 Plan, Legal->getInductionVars(), DeadInstructions); 7108 7109 return Plan; 7110 } 7111 7112 Value* LoopVectorizationPlanner::VPCallbackILV:: 7113 getOrCreateVectorValues(Value *V, unsigned Part) { 7114 return ILV.getOrCreateVectorValue(V, Part); 7115 } 7116 7117 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { 7118 O << " +\n" 7119 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7120 IG->getInsertPos()->printAsOperand(O, false); 7121 if (User) { 7122 O << ", "; 7123 User->getOperand(0)->printAsOperand(O); 7124 } 7125 O << "\\l\""; 7126 for (unsigned i = 0; i < IG->getFactor(); ++i) 7127 if (Instruction *I = IG->getMember(i)) 7128 O << " +\n" 7129 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7130 } 7131 7132 void VPWidenRecipe::execute(VPTransformState &State) { 7133 for (auto &Instr : make_range(Begin, End)) 7134 State.ILV->widenInstruction(Instr); 7135 } 7136 7137 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7138 assert(!State.Instance && "Int or FP induction being replicated."); 7139 State.ILV->widenIntOrFpInduction(IV, Trunc); 7140 } 7141 7142 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7143 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7144 } 7145 7146 void VPBlendRecipe::execute(VPTransformState &State) { 7147 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7148 // We know that all PHIs in non-header blocks are converted into 7149 // selects, so we don't have to worry about the insertion order and we 7150 // can just use the builder. 7151 // At this point we generate the predication tree. There may be 7152 // duplications since this is a simple recursive scan, but future 7153 // optimizations will clean it up. 7154 7155 unsigned NumIncoming = Phi->getNumIncomingValues(); 7156 7157 assert((User || NumIncoming == 1) && 7158 "Multiple predecessors with predecessors having a full mask"); 7159 // Generate a sequence of selects of the form: 7160 // SELECT(Mask3, In3, 7161 // SELECT(Mask2, In2, 7162 // ( ...))) 7163 InnerLoopVectorizer::VectorParts Entry(State.UF); 7164 for (unsigned In = 0; In < NumIncoming; ++In) { 7165 for (unsigned Part = 0; Part < State.UF; ++Part) { 7166 // We might have single edge PHIs (blocks) - use an identity 7167 // 'select' for the first PHI operand. 7168 Value *In0 = 7169 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7170 if (In == 0) 7171 Entry[Part] = In0; // Initialize with the first incoming value. 7172 else { 7173 // Select between the current value and the previous incoming edge 7174 // based on the incoming mask. 
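        // Illustrative IR (value names assumed, one unroll part, VF=4): with
        // incoming values In1..In3 and edge masks Mask2/Mask3 this produces
        //   %predphi  = select <4 x i1> %Mask2, <4 x i32> %In2, <4 x i32> %In1
        //   %predphi1 = select <4 x i1> %Mask3, <4 x i32> %In3, <4 x i32> %predphi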
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  if (!User)
    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
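  // (The placeholder 'unreachable' terminator is expected to have been
  // installed when the VPlan CFG materialized this basic block, before its
  // successors existed; at this point only the branch condition is known.)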
7242 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7243 assert(isa<UnreachableInst>(CurrentTerminator) && 7244 "Expected to replace unreachable terminator with conditional branch."); 7245 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7246 CondBr->setSuccessor(0, nullptr); 7247 ReplaceInstWithInst(CurrentTerminator, CondBr); 7248 } 7249 7250 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7251 assert(State.Instance && "Predicated instruction PHI works per instance."); 7252 Instruction *ScalarPredInst = cast<Instruction>( 7253 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7254 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7255 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7256 assert(PredicatingBB && "Predicated block has no single predecessor."); 7257 7258 // By current pack/unpack logic we need to generate only a single phi node: if 7259 // a vector value for the predicated instruction exists at this point it means 7260 // the instruction has vector users only, and a phi for the vector value is 7261 // needed. In this case the recipe of the predicated instruction is marked to 7262 // also do that packing, thereby "hoisting" the insert-element sequence. 7263 // Otherwise, a phi node for the scalar value is needed. 7264 unsigned Part = State.Instance->Part; 7265 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7266 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7267 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7268 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7269 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7270 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7271 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7272 } else { 7273 Type *PredInstType = PredInst->getType(); 7274 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7275 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7276 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7277 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7278 } 7279 } 7280 7281 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7282 if (!User) 7283 return State.ILV->vectorizeMemoryInstruction(&Instr); 7284 7285 // Last (and currently only) operand is a mask. 7286 InnerLoopVectorizer::VectorParts MaskValues(State.UF); 7287 VPValue *Mask = User->getOperand(User->getNumOperands() - 1); 7288 for (unsigned Part = 0; Part < State.UF; ++Part) 7289 MaskValues[Part] = State.get(Mask, Part); 7290 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); 7291 } 7292 7293 static ScalarEpilogueLowering 7294 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, 7295 ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { 7296 ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; 7297 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && 7298 (F->hasOptSize() || 7299 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) 7300 SEL = CM_ScalarEpilogueNotAllowedOptSize; 7301 else if (Hints.getPredicate()) 7302 SEL = CM_ScalarEpilogueNotNeededPredicatePragma; 7303 7304 return SEL; 7305 } 7306 7307 // Process the loop in the VPlan-native vectorization path. 
This path builds 7308 // VPlan upfront in the vectorization pipeline, which allows to apply 7309 // VPlan-to-VPlan transformations from the very beginning without modifying the 7310 // input LLVM IR. 7311 static bool processLoopInVPlanNativePath( 7312 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7313 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7314 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7315 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7316 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7317 7318 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7319 Function *F = L->getHeader()->getParent(); 7320 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7321 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); 7322 7323 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7324 &Hints, IAI); 7325 // Use the planner for outer loop vectorization. 7326 // TODO: CM is not used at this point inside the planner. Turn CM into an 7327 // optional argument if we don't need it in the future. 7328 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); 7329 7330 // Get user vectorization factor. 7331 const unsigned UserVF = Hints.getWidth(); 7332 7333 // Plan how to best vectorize, return the best VF and its cost. 7334 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7335 7336 // If we are stress testing VPlan builds, do not attempt to generate vector 7337 // code. Masked vector code generation support will follow soon. 7338 // Also, do not attempt to vectorize if no vector code will be produced. 7339 if (VPlanBuildStressTest || EnableVPlanPredication || 7340 VectorizationFactor::Disabled() == VF) 7341 return false; 7342 7343 LVP.setBestPlan(VF.Width, 1); 7344 7345 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7346 &CM); 7347 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7348 << L->getHeader()->getParent()->getName() << "\"\n"); 7349 LVP.executePlan(LB, DT); 7350 7351 // Mark the loop as already vectorized to avoid vectorizing again. 7352 Hints.setAlreadyVectorized(); 7353 7354 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7355 return true; 7356 } 7357 7358 bool LoopVectorizePass::processLoop(Loop *L) { 7359 assert((EnableVPlanNativePath || L->empty()) && 7360 "VPlan-native path is not enabled. Only process inner loops."); 7361 7362 #ifndef NDEBUG 7363 const std::string DebugLocStr = getDebugLocString(L); 7364 #endif /* NDEBUG */ 7365 7366 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7367 << L->getHeader()->getParent()->getName() << "\" from " 7368 << DebugLocStr << "\n"); 7369 7370 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7371 7372 LLVM_DEBUG( 7373 dbgs() << "LV: Loop hints:" 7374 << " force=" 7375 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7376 ? "disabled" 7377 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7378 ? "enabled" 7379 : "?")) 7380 << " width=" << Hints.getWidth() 7381 << " unroll=" << Hints.getInterleave() << "\n"); 7382 7383 // Function containing loop 7384 Function *F = L->getHeader()->getParent(); 7385 7386 // Looking at the diagnostic output is the only way to determine if a loop 7387 // was vectorized (other than looking at the IR or machine code), so it 7388 // is important to generate an optimization remark for each loop. Most of 7389 // these messages are generated as OptimizationRemarkAnalysis. 
Remarks 7390 // generated as OptimizationRemark and OptimizationRemarkMissed are 7391 // less verbose reporting vectorized loops and unvectorized loops that may 7392 // benefit from vectorization, respectively. 7393 7394 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7395 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7396 return false; 7397 } 7398 7399 PredicatedScalarEvolution PSE(*SE, *L); 7400 7401 // Check if it is legal to vectorize the loop. 7402 LoopVectorizationRequirements Requirements(*ORE); 7403 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7404 &Requirements, &Hints, DB, AC); 7405 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7406 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7407 Hints.emitRemarkWithHints(); 7408 return false; 7409 } 7410 7411 // Check the function attributes and profiles to find out if this function 7412 // should be optimized for size. 7413 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); 7414 7415 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7416 // here. They may require CFG and instruction level transformations before 7417 // even evaluating whether vectorization is profitable. Since we cannot modify 7418 // the incoming IR, we need to build VPlan upfront in the vectorization 7419 // pipeline. 7420 if (!L->empty()) 7421 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7422 ORE, BFI, PSI, Hints); 7423 7424 assert(L->empty() && "Inner loop expected."); 7425 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7426 // count by optimizing for size, to minimize overheads. 7427 // Prefer constant trip counts over profile data, over upper bound estimate. 7428 unsigned ExpectedTC = 0; 7429 bool HasExpectedTC = false; 7430 if (const SCEVConstant *ConstExits = 7431 dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) { 7432 const APInt &ExitsCount = ConstExits->getAPInt(); 7433 // We are interested in small values for ExpectedTC. Skip over those that 7434 // can't fit an unsigned. 7435 if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) { 7436 ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1; 7437 HasExpectedTC = true; 7438 } 7439 } 7440 // ExpectedTC may be large because it's bound by a variable. Check 7441 // profiling information to validate we should vectorize. 7442 if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { 7443 auto EstimatedTC = getLoopEstimatedTripCount(L); 7444 if (EstimatedTC) { 7445 ExpectedTC = *EstimatedTC; 7446 HasExpectedTC = true; 7447 } 7448 } 7449 if (!HasExpectedTC) { 7450 ExpectedTC = SE->getSmallConstantMaxTripCount(L); 7451 HasExpectedTC = (ExpectedTC > 0); 7452 } 7453 7454 if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { 7455 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7456 << "This loop is worth vectorizing only if no scalar " 7457 << "iteration overheads are incurred."); 7458 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7459 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7460 else { 7461 LLVM_DEBUG(dbgs() << "\n"); 7462 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7463 } 7464 } 7465 7466 // Check the function attributes to see if implicit floats are allowed. 
7467 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7468 // an integer loop and the vector instructions selected are purely integer 7469 // vector instructions? 7470 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7471 reportVectorizationFailure( 7472 "Can't vectorize when the NoImplicitFloat attribute is used", 7473 "loop not vectorized due to NoImplicitFloat attribute", 7474 "NoImplicitFloat", ORE, L); 7475 Hints.emitRemarkWithHints(); 7476 return false; 7477 } 7478 7479 // Check if the target supports potentially unsafe FP vectorization. 7480 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7481 // for the target we're vectorizing for, to make sure none of the 7482 // additional fp-math flags can help. 7483 if (Hints.isPotentiallyUnsafe() && 7484 TTI->isFPVectorizationPotentiallyUnsafe()) { 7485 reportVectorizationFailure( 7486 "Potentially unsafe FP op prevents vectorization", 7487 "loop not vectorized due to unsafe FP support.", 7488 "UnsafeFP", ORE, L); 7489 Hints.emitRemarkWithHints(); 7490 return false; 7491 } 7492 7493 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7494 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7495 7496 // If an override option has been passed in for interleaved accesses, use it. 7497 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7498 UseInterleaved = EnableInterleavedMemAccesses; 7499 7500 // Analyze interleaved memory accesses. 7501 if (UseInterleaved) { 7502 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7503 } 7504 7505 // Use the cost model. 7506 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7507 F, &Hints, IAI); 7508 CM.collectValuesToIgnore(); 7509 7510 // Use the planner for vectorization. 7511 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); 7512 7513 // Get user vectorization factor. 7514 unsigned UserVF = Hints.getWidth(); 7515 7516 // Plan how to best vectorize, return the best VF and its cost. 7517 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7518 7519 VectorizationFactor VF = VectorizationFactor::Disabled(); 7520 unsigned IC = 1; 7521 unsigned UserIC = Hints.getInterleave(); 7522 7523 if (MaybeVF) { 7524 VF = *MaybeVF; 7525 // Select the interleave count. 7526 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7527 } 7528 7529 // Identify the diagnostic messages that should be produced. 7530 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 7531 bool VectorizeLoop = true, InterleaveLoop = true; 7532 if (Requirements.doesNotMeet(F, L, Hints)) { 7533 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 7534 "requirements.\n"); 7535 Hints.emitRemarkWithHints(); 7536 return false; 7537 } 7538 7539 if (VF.Width == 1) { 7540 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 7541 VecDiagMsg = std::make_pair( 7542 "VectorizationNotBeneficial", 7543 "the cost-model indicates that vectorization is not beneficial"); 7544 VectorizeLoop = false; 7545 } 7546 7547 if (!MaybeVF && UserIC > 1) { 7548 // Tell the user interleaving was avoided up-front, despite being explicitly 7549 // requested. 
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
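    // For example, with VF.Width == 4 and IC == 2 the remark emitted below
    // reads: "vectorized loop (vectorization width: 4, interleaved count: 2)".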
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
7740 return Changed; 7741 } 7742 7743 PreservedAnalyses LoopVectorizePass::run(Function &F, 7744 FunctionAnalysisManager &AM) { 7745 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 7746 auto &LI = AM.getResult<LoopAnalysis>(F); 7747 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 7748 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 7749 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 7750 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 7751 auto &AA = AM.getResult<AAManager>(F); 7752 auto &AC = AM.getResult<AssumptionAnalysis>(F); 7753 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 7754 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 7755 MemorySSA *MSSA = EnableMSSALoopDependency 7756 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() 7757 : nullptr; 7758 7759 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 7760 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 7761 [&](Loop &L) -> const LoopAccessInfo & { 7762 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 7763 return LAM.getResult<LoopAccessAnalysis>(L, AR); 7764 }; 7765 const ModuleAnalysisManager &MAM = 7766 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 7767 ProfileSummaryInfo *PSI = 7768 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 7769 bool Changed = 7770 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 7771 if (!Changed) 7772 return PreservedAnalyses::all(); 7773 PreservedAnalyses PA; 7774 7775 // We currently do not preserve loopinfo/dominator analyses with outer loop 7776 // vectorization. Until this is addressed, mark these analyses as preserved 7777 // only for non-VPlan-native path. 7778 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 7779 if (!EnableVPlanNativePath) { 7780 PA.preserve<LoopAnalysis>(); 7781 PA.preserve<DominatorTreeAnalysis>(); 7782 } 7783 PA.preserve<BasicAA>(); 7784 PA.preserve<GlobalsAA>(); 7785 return PA; 7786 } 7787
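
// Usage sketch (not part of the pass itself): the vectorizer can be exercised
// directly with the new pass manager, e.g.
//   opt -passes=loop-vectorize -S input.ll
// or via the legacy pass name -loop-vectorize.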