//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired, predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
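/// For example, an interleave group of factor 2 in which only the even
/// elements (A[2*i]) are accessed has a gap at the odd elements; those lanes
/// may need to be masked away in the wide load, e.g. to avoid touching memory
/// the scalar loop never accesses. (Illustrative note.)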
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
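  // For example (under a typical data layout), x86_fp80 holds 80 bits of data
  // but is allocated with padding (96 or 128 bits), so an array of such
  // elements is laid out differently from the corresponding vector type and
  // the type is considered irregular. (Illustrative note only.)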
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use.
  /// Otherwise, if the use is scalar, we can use the existing scalar
  /// definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value.
  /// While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
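  /// The decision is broadcast to all members of the group, but the cost is
  /// attributed only to the member at the group's insert position; the
  /// remaining members are recorded with a cost of 0.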
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
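    // An entry for VF in Uniforms serves as the marker that the analysis
    // below has already been performed for this VF.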
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    MaybeAlign Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
1344 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1345 
1346 /// The cost computation for Gather/Scatter instruction.
1347 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1348 
1349 /// The cost computation for widening instruction \p I with consecutive
1350 /// memory access.
1351 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1352 
1353 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1354 /// Load: scalar load + broadcast.
1355 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1356 /// element)
1357 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1358 
1359 /// Estimate the overhead of scalarizing an instruction. This is a
1360 /// convenience wrapper for the type-based getScalarizationOverhead API.
1361 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1362 
1363 /// Returns whether the instruction is a load or store and will be emitted
1364 /// as a vector operation.
1365 bool isConsecutiveLoadOrStore(Instruction *I);
1366 
1367 /// Returns true if an artificially high cost for emulated masked memrefs
1368 /// should be used.
1369 bool useEmulatedMaskMemRefHack(Instruction *I);
1370 
1371 /// Map of scalar integer values to the smallest bitwidth they can be legally
1372 /// represented as. The vector equivalents of these values should be truncated
1373 /// to this type.
1374 MapVector<Instruction *, uint64_t> MinBWs;
1375 
1376 /// A type representing the costs for instructions if they were to be
1377 /// scalarized rather than vectorized. The entries are Instruction-Cost
1378 /// pairs.
1379 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1380 
1381 /// A set containing all BasicBlocks that are known to be present after
1382 /// vectorization as predicated blocks.
1383 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1384 
1385 /// Records whether it is allowed to have the original scalar loop execute at
1386 /// least once. This may be needed as a fallback loop in case runtime
1387 /// aliasing/dependence checks fail, or to handle the tail/remainder
1388 /// iterations when the trip count is unknown or doesn't divide evenly by the VF,
1389 /// or as a peel-loop to handle gaps in interleave-groups.
1390 /// Under optsize and when the trip count is very small we don't allow any
1391 /// iterations to execute in the scalar loop.
1392 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1393 
1394 /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1395 bool FoldTailByMasking = false;
1396 
1397 /// A map holding scalar costs for different vectorization factors. The
1398 /// presence of a cost for an instruction in the mapping indicates that the
1399 /// instruction will be scalarized when vectorizing with the associated
1400 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1401 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1402 
1403 /// Holds the instructions known to be uniform after vectorization.
1404 /// The data is collected per VF.
1405 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1406 
1407 /// Holds the instructions known to be scalar after vectorization.
1408 /// The data is collected per VF.
1409 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1410 
1411 /// Holds the instructions (address computations) that are forced to be
1412 /// scalarized.
1413 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1414 1415 /// Returns the expected difference in cost from scalarizing the expression 1416 /// feeding a predicated instruction \p PredInst. The instructions to 1417 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1418 /// non-negative return value implies the expression will be scalarized. 1419 /// Currently, only single-use chains are considered for scalarization. 1420 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1421 unsigned VF); 1422 1423 /// Collect the instructions that are uniform after vectorization. An 1424 /// instruction is uniform if we represent it with a single scalar value in 1425 /// the vectorized loop corresponding to each vector iteration. Examples of 1426 /// uniform instructions include pointer operands of consecutive or 1427 /// interleaved memory accesses. Note that although uniformity implies an 1428 /// instruction will be scalar, the reverse is not true. In general, a 1429 /// scalarized instruction will be represented by VF scalar values in the 1430 /// vectorized loop, each corresponding to an iteration of the original 1431 /// scalar loop. 1432 void collectLoopUniforms(unsigned VF); 1433 1434 /// Collect the instructions that are scalar after vectorization. An 1435 /// instruction is scalar if it is known to be uniform or will be scalarized 1436 /// during vectorization. Non-uniform scalarized instructions will be 1437 /// represented by VF values in the vectorized loop, each corresponding to an 1438 /// iteration of the original scalar loop. 1439 void collectLoopScalars(unsigned VF); 1440 1441 /// Keeps cost model vectorization decision and cost for instructions. 1442 /// Right now it is used for memory instructions only. 1443 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1444 std::pair<InstWidening, unsigned>>; 1445 1446 DecisionList WideningDecisions; 1447 1448 /// Returns true if \p V is expected to be vectorized and it needs to be 1449 /// extracted. 1450 bool needsExtract(Value *V, unsigned VF) const { 1451 Instruction *I = dyn_cast<Instruction>(V); 1452 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1453 return false; 1454 1455 // Assume we can vectorize V (and hence we need extraction) if the 1456 // scalars are not computed yet. This can happen, because it is called 1457 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1458 // the scalars are collected. That should be a safe assumption in most 1459 // cases, because we check if the operands have vectorizable types 1460 // beforehand in LoopVectorizationLegality. 1461 return Scalars.find(VF) == Scalars.end() || 1462 !isScalarAfterVectorization(I, VF); 1463 }; 1464 1465 /// Returns a range containing only operands needing to be extracted. 1466 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1467 unsigned VF) { 1468 return SmallVector<Value *, 4>(make_filter_range( 1469 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1470 } 1471 1472 public: 1473 /// The loop that we evaluate. 1474 Loop *TheLoop; 1475 1476 /// Predicated scalar evolution analysis. 1477 PredicatedScalarEvolution &PSE; 1478 1479 /// Loop Info analysis. 1480 LoopInfo *LI; 1481 1482 /// Vectorization legality. 1483 LoopVectorizationLegality *Legal; 1484 1485 /// Vector target information. 1486 const TargetTransformInfo &TTI; 1487 1488 /// Target Library Info. 
1489 const TargetLibraryInfo *TLI; 1490 1491 /// Demanded bits analysis. 1492 DemandedBits *DB; 1493 1494 /// Assumption cache. 1495 AssumptionCache *AC; 1496 1497 /// Interface to emit optimization remarks. 1498 OptimizationRemarkEmitter *ORE; 1499 1500 const Function *TheFunction; 1501 1502 /// Loop Vectorize Hint. 1503 const LoopVectorizeHints *Hints; 1504 1505 /// The interleave access information contains groups of interleaved accesses 1506 /// with the same stride and close to each other. 1507 InterleavedAccessInfo &InterleaveInfo; 1508 1509 /// Values to ignore in the cost model. 1510 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1511 1512 /// Values to ignore in the cost model when VF > 1. 1513 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1514 }; 1515 1516 } // end namespace llvm 1517 1518 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1519 // vectorization. The loop needs to be annotated with #pragma omp simd 1520 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1521 // vector length information is not provided, vectorization is not considered 1522 // explicit. Interleave hints are not allowed either. These limitations will be 1523 // relaxed in the future. 1524 // Please, note that we are currently forced to abuse the pragma 'clang 1525 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1526 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1527 // provides *explicit vectorization hints* (LV can bypass legal checks and 1528 // assume that vectorization is legal). However, both hints are implemented 1529 // using the same metadata (llvm.loop.vectorize, processed by 1530 // LoopVectorizeHints). This will be fixed in the future when the native IR 1531 // representation for pragma 'omp simd' is introduced. 1532 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1533 OptimizationRemarkEmitter *ORE) { 1534 assert(!OuterLp->empty() && "This is not an outer loop"); 1535 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1536 1537 // Only outer loops with an explicit vectorization hint are supported. 1538 // Unannotated outer loops are ignored. 1539 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1540 return false; 1541 1542 Function *Fn = OuterLp->getHeader()->getParent(); 1543 if (!Hints.allowVectorization(Fn, OuterLp, 1544 true /*VectorizeOnlyWhenForced*/)) { 1545 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1546 return false; 1547 } 1548 1549 if (Hints.getInterleave() > 1) { 1550 // TODO: Interleave support is future work. 1551 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1552 "outer loops.\n"); 1553 Hints.emitRemarkWithHints(); 1554 return false; 1555 } 1556 1557 return true; 1558 } 1559 1560 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1561 OptimizationRemarkEmitter *ORE, 1562 SmallVectorImpl<Loop *> &V) { 1563 // Collect inner loops and outer loops without irreducible control flow. For 1564 // now, only collect outer loops that have explicit vectorization hints. If we 1565 // are stress testing the VPlan H-CFG construction, we collect the outermost 1566 // loop of every loop nest. 
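// For illustration (a sketch, not verbatim compiler output): an outer loop
// annotated with '#pragma omp simd simdlen(4)' typically carries loop
// metadata along the lines of
//   !llvm.loop !0
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.width", i32 4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
// which is what isExplicitVecOuterLoop above recognizes via LoopVectorizeHints.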
1567 if (L.empty() || VPlanBuildStressTest || 1568 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1569 LoopBlocksRPO RPOT(&L); 1570 RPOT.perform(LI); 1571 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1572 V.push_back(&L); 1573 // TODO: Collect inner loops inside marked outer loops in case 1574 // vectorization fails for the outer loop. Do not invoke 1575 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1576 // already known to be reducible. We can use an inherited attribute for 1577 // that. 1578 return; 1579 } 1580 } 1581 for (Loop *InnerL : L) 1582 collectSupportedLoops(*InnerL, LI, ORE, V); 1583 } 1584 1585 namespace { 1586 1587 /// The LoopVectorize Pass. 1588 struct LoopVectorize : public FunctionPass { 1589 /// Pass identification, replacement for typeid 1590 static char ID; 1591 1592 LoopVectorizePass Impl; 1593 1594 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1595 bool VectorizeOnlyWhenForced = false) 1596 : FunctionPass(ID) { 1597 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1598 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1599 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1600 } 1601 1602 bool runOnFunction(Function &F) override { 1603 if (skipFunction(F)) 1604 return false; 1605 1606 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1607 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1608 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1609 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1610 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1611 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1612 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1613 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1614 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1615 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1616 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1617 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1618 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1619 1620 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1621 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1622 1623 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1624 GetLAA, *ORE, PSI); 1625 } 1626 1627 void getAnalysisUsage(AnalysisUsage &AU) const override { 1628 AU.addRequired<AssumptionCacheTracker>(); 1629 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1630 AU.addRequired<DominatorTreeWrapperPass>(); 1631 AU.addRequired<LoopInfoWrapperPass>(); 1632 AU.addRequired<ScalarEvolutionWrapperPass>(); 1633 AU.addRequired<TargetTransformInfoWrapperPass>(); 1634 AU.addRequired<AAResultsWrapperPass>(); 1635 AU.addRequired<LoopAccessLegacyAnalysis>(); 1636 AU.addRequired<DemandedBitsWrapperPass>(); 1637 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1638 1639 // We currently do not preserve loopinfo/dominator analyses with outer loop 1640 // vectorization. Until this is addressed, mark these analyses as preserved 1641 // only for non-VPlan-native path. 1642 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1643 if (!EnableVPlanNativePath) { 1644 AU.addPreserved<LoopInfoWrapperPass>(); 1645 AU.addPreserved<DominatorTreeWrapperPass>(); 1646 } 1647 1648 AU.addPreserved<BasicAAWrapperPass>(); 1649 AU.addPreserved<GlobalsAAWrapperPass>(); 1650 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1651 } 1652 }; 1653 1654 } // end anonymous namespace 1655 1656 //===----------------------------------------------------------------------===// 1657 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1658 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1659 //===----------------------------------------------------------------------===// 1660 1661 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1662 // We need to place the broadcast of invariant variables outside the loop, 1663 // but only if it's proven safe to do so. Else, broadcast will be inside 1664 // vector loop body. 1665 Instruction *Instr = dyn_cast<Instruction>(V); 1666 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1667 (!Instr || 1668 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1669 // Place the code for broadcasting invariant variables in the new preheader. 1670 IRBuilder<>::InsertPointGuard Guard(Builder); 1671 if (SafeToHoist) 1672 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1673 1674 // Broadcast the scalar into all locations in the vector. 1675 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1676 1677 return Shuf; 1678 } 1679 1680 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1681 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1682 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1683 "Expected either an induction phi-node or a truncate of it!"); 1684 Value *Start = II.getStartValue(); 1685 1686 // Construct the initial value of the vector IV in the vector loop preheader 1687 auto CurrIP = Builder.saveIP(); 1688 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1689 if (isa<TruncInst>(EntryVal)) { 1690 assert(Start->getType()->isIntegerTy() && 1691 "Truncation requires an integer type"); 1692 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1693 Step = Builder.CreateTrunc(Step, TruncType); 1694 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1695 } 1696 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1697 Value *SteppedStart = 1698 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1699 1700 // We create vector phi nodes for both integer and floating-point induction 1701 // variables. Here, we determine the kind of arithmetic we will perform. 1702 Instruction::BinaryOps AddOp; 1703 Instruction::BinaryOps MulOp; 1704 if (Step->getType()->isIntegerTy()) { 1705 AddOp = Instruction::Add; 1706 MulOp = Instruction::Mul; 1707 } else { 1708 AddOp = II.getInductionOpcode(); 1709 MulOp = Instruction::FMul; 1710 } 1711 1712 // Multiply the vectorization factor by the step using integer or 1713 // floating-point arithmetic as appropriate. 1714 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1715 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1716 1717 // Create a vector splat to use in the induction update. 1718 // 1719 // FIXME: If the step is non-constant, we create the vector splat with 1720 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1721 // handle a constant vector splat. 1722 Value *SplatVF = isa<Constant>(Mul) 1723 ? 
ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1724 : Builder.CreateVectorSplat(VF, Mul); 1725 Builder.restoreIP(CurrIP); 1726 1727 // We may need to add the step a number of times, depending on the unroll 1728 // factor. The last of those goes into the PHI. 1729 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1730 &*LoopVectorBody->getFirstInsertionPt()); 1731 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1732 Instruction *LastInduction = VecInd; 1733 for (unsigned Part = 0; Part < UF; ++Part) { 1734 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1735 1736 if (isa<TruncInst>(EntryVal)) 1737 addMetadata(LastInduction, EntryVal); 1738 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1739 1740 LastInduction = cast<Instruction>(addFastMathFlag( 1741 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1742 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1743 } 1744 1745 // Move the last step to the end of the latch block. This ensures consistent 1746 // placement of all induction updates. 1747 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1748 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1749 auto *ICmp = cast<Instruction>(Br->getCondition()); 1750 LastInduction->moveBefore(ICmp); 1751 LastInduction->setName("vec.ind.next"); 1752 1753 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1754 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1755 } 1756 1757 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1758 return Cost->isScalarAfterVectorization(I, VF) || 1759 Cost->isProfitableToScalarize(I, VF); 1760 } 1761 1762 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1763 if (shouldScalarizeInstruction(IV)) 1764 return true; 1765 auto isScalarInst = [&](User *U) -> bool { 1766 auto *I = cast<Instruction>(U); 1767 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1768 }; 1769 return llvm::any_of(IV->users(), isScalarInst); 1770 } 1771 1772 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1773 const InductionDescriptor &ID, const Instruction *EntryVal, 1774 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1775 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1776 "Expected either an induction phi-node or a truncate of it!"); 1777 1778 // This induction variable is not the phi from the original loop but the 1779 // newly-created IV based on the proof that casted Phi is equal to the 1780 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1781 // re-uses the same InductionDescriptor that original IV uses but we don't 1782 // have to do any recording in this case - that is done when original IV is 1783 // processed. 1784 if (isa<TruncInst>(EntryVal)) 1785 return; 1786 1787 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1788 if (Casts.empty()) 1789 return; 1790 // Only the first Cast instruction in the Casts vector is of interest. 1791 // The rest of the Casts (if exist) have no uses outside the 1792 // induction update chain itself. 
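// For illustration (hypothetical IR; assumes the cast was proven redundant by
// SCEV, possibly under a runtime guard): given
//   %iv     = phi i32 [ 0, %ph ], [ %iv.next, %latch ]
//   %iv.ext = sext i32 %iv to i64
// the vector/scalar values created for %iv are also recorded below for
// %iv.ext, so users of the cast directly pick up the new induction values.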
1793 Instruction *CastInst = *Casts.begin(); 1794 if (Lane < UINT_MAX) 1795 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1796 else 1797 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1798 } 1799 1800 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1801 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1802 "Primary induction variable must have an integer type"); 1803 1804 auto II = Legal->getInductionVars()->find(IV); 1805 assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); 1806 1807 auto ID = II->second; 1808 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1809 1810 // The scalar value to broadcast. This will be derived from the canonical 1811 // induction variable. 1812 Value *ScalarIV = nullptr; 1813 1814 // The value from the original loop to which we are mapping the new induction 1815 // variable. 1816 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1817 1818 // True if we have vectorized the induction variable. 1819 auto VectorizedIV = false; 1820 1821 // Determine if we want a scalar version of the induction variable. This is 1822 // true if the induction variable itself is not widened, or if it has at 1823 // least one user in the loop that is not widened. 1824 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); 1825 1826 // Generate code for the induction step. Note that induction steps are 1827 // required to be loop-invariant 1828 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && 1829 "Induction step should be loop invariant"); 1830 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1831 Value *Step = nullptr; 1832 if (PSE.getSE()->isSCEVable(IV->getType())) { 1833 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1834 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), 1835 LoopVectorPreHeader->getTerminator()); 1836 } else { 1837 Step = cast<SCEVUnknown>(ID.getStep())->getValue(); 1838 } 1839 1840 // Try to create a new independent vector induction variable. If we can't 1841 // create the phi node, we will splat the scalar induction variable in each 1842 // loop iteration. 1843 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { 1844 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1845 VectorizedIV = true; 1846 } 1847 1848 // If we haven't yet vectorized the induction variable, or if we will create 1849 // a scalar one, we need to define the scalar induction variable and step 1850 // values. If we were given a truncation type, truncate the canonical 1851 // induction variable and step. Otherwise, derive these values from the 1852 // induction descriptor. 1853 if (!VectorizedIV || NeedsScalarIV) { 1854 ScalarIV = Induction; 1855 if (IV != OldInduction) { 1856 ScalarIV = IV->getType()->isIntegerTy() 1857 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1858 : Builder.CreateCast(Instruction::SIToFP, Induction, 1859 IV->getType()); 1860 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1861 ScalarIV->setName("offset.idx"); 1862 } 1863 if (Trunc) { 1864 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1865 assert(Step->getType()->isIntegerTy() && 1866 "Truncation requires an integer step"); 1867 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1868 Step = Builder.CreateTrunc(Step, TruncType); 1869 } 1870 } 1871 1872 // If we haven't yet vectorized the induction variable, splat the scalar 1873 // induction variable, and build the necessary step vectors. 1874 // TODO: Don't do it unless the vectorized IV is really required. 1875 if (!VectorizedIV) { 1876 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1877 for (unsigned Part = 0; Part < UF; ++Part) { 1878 Value *EntryPart = 1879 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1880 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1881 if (Trunc) 1882 addMetadata(EntryPart, Trunc); 1883 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1884 } 1885 } 1886 1887 // If an induction variable is only used for counting loop iterations or 1888 // calculating addresses, it doesn't need to be widened. Create scalar steps 1889 // that can be used by instructions we will later scalarize. Note that the 1890 // addition of the scalar steps will not increase the number of instructions 1891 // in the loop in the common case prior to InstCombine. We will be trading 1892 // one vector extract for each scalar step. 1893 if (NeedsScalarIV) 1894 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1895 } 1896 1897 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1898 Instruction::BinaryOps BinOp) { 1899 // Create and check the types. 1900 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1901 int VLen = Val->getType()->getVectorNumElements(); 1902 1903 Type *STy = Val->getType()->getScalarType(); 1904 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1905 "Induction Step must be an integer or FP"); 1906 assert(Step->getType() == STy && "Step has wrong type"); 1907 1908 SmallVector<Constant *, 8> Indices; 1909 1910 if (STy->isIntegerTy()) { 1911 // Create a vector of consecutive numbers from zero to VF. 1912 for (int i = 0; i < VLen; ++i) 1913 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1914 1915 // Add the consecutive indices to the vector value. 1916 Constant *Cv = ConstantVector::get(Indices); 1917 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1918 Step = Builder.CreateVectorSplat(VLen, Step); 1919 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1920 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1921 // which can be found from the original scalar operations. 1922 Step = Builder.CreateMul(Cv, Step); 1923 return Builder.CreateAdd(Val, Step, "induction"); 1924 } 1925 1926 // Floating point induction. 1927 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1928 "Binary Opcode should be specified for FP induction"); 1929 // Create a vector of consecutive numbers from zero to VF. 1930 for (int i = 0; i < VLen; ++i) 1931 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1932 1933 // Add the consecutive indices to the vector value. 
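// For illustration, with VLen = 4, StartIdx = 0 and an FP step %s, the code
// below builds Cv = <0.0, 1.0, 2.0, 3.0>, splats %s, multiplies the two with
// fast-math flags, and applies BinOp, yielding Val +/- <0.0, %s, 2*%s, 3*%s>.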
1934 Constant *Cv = ConstantVector::get(Indices); 1935 1936 Step = Builder.CreateVectorSplat(VLen, Step); 1937 1938 // Floating point operations had to be 'fast' to enable the induction. 1939 FastMathFlags Flags; 1940 Flags.setFast(); 1941 1942 Value *MulOp = Builder.CreateFMul(Cv, Step); 1943 if (isa<Instruction>(MulOp)) 1944 // Have to check, MulOp may be a constant 1945 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1946 1947 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1948 if (isa<Instruction>(BOp)) 1949 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1950 return BOp; 1951 } 1952 1953 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1954 Instruction *EntryVal, 1955 const InductionDescriptor &ID) { 1956 // We shouldn't have to build scalar steps if we aren't vectorizing. 1957 assert(VF > 1 && "VF should be greater than one"); 1958 1959 // Get the value type and ensure it and the step have the same integer type. 1960 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1961 assert(ScalarIVTy == Step->getType() && 1962 "Val and Step should have the same type"); 1963 1964 // We build scalar steps for both integer and floating-point induction 1965 // variables. Here, we determine the kind of arithmetic we will perform. 1966 Instruction::BinaryOps AddOp; 1967 Instruction::BinaryOps MulOp; 1968 if (ScalarIVTy->isIntegerTy()) { 1969 AddOp = Instruction::Add; 1970 MulOp = Instruction::Mul; 1971 } else { 1972 AddOp = ID.getInductionOpcode(); 1973 MulOp = Instruction::FMul; 1974 } 1975 1976 // Determine the number of scalars we need to generate for each unroll 1977 // iteration. If EntryVal is uniform, we only need to generate the first 1978 // lane. Otherwise, we generate all VF values. 1979 unsigned Lanes = 1980 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1981 : VF; 1982 // Compute the scalar steps and save the results in VectorLoopValueMap. 1983 for (unsigned Part = 0; Part < UF; ++Part) { 1984 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1985 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1986 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1987 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1988 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1989 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1990 } 1991 } 1992 } 1993 1994 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 1995 assert(V != Induction && "The new induction variable should not be used."); 1996 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 1997 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 1998 1999 // If we have a stride that is replaced by one, do it here. Defer this for 2000 // the VPlan-native path until we start running Legal checks in that path. 2001 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2002 V = ConstantInt::get(V->getType(), 1); 2003 2004 // If we have a vector mapped to this value, return it. 2005 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2006 return VectorLoopValueMap.getVectorValue(V, Part); 2007 2008 // If the value has not been vectorized, check if it has been scalarized 2009 // instead. If it has been scalarized, and we actually need the value in 2010 // vector form, we will construct the vector values on demand. 
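// For illustration (VF = 4, hypothetical names): if V was scalarized into
// %s0..%s3 for this Part, its vector form is assembled on demand as
//   %p0 = insertelement <4 x T> undef, T %s0, i32 0
//   %p1 = insertelement <4 x T> %p0, T %s1, i32 1
//   ...
// (see packScalarIntoVectorValue below), whereas a value that is uniform
// after vectorization is simply broadcast from its lane-zero scalar.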
2011 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2012 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2013 2014 // If we've scalarized a value, that value should be an instruction. 2015 auto *I = cast<Instruction>(V); 2016 2017 // If we aren't vectorizing, we can just copy the scalar map values over to 2018 // the vector map. 2019 if (VF == 1) { 2020 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2021 return ScalarValue; 2022 } 2023 2024 // Get the last scalar instruction we generated for V and Part. If the value 2025 // is known to be uniform after vectorization, this corresponds to lane zero 2026 // of the Part unroll iteration. Otherwise, the last instruction is the one 2027 // we created for the last vector lane of the Part unroll iteration. 2028 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2029 auto *LastInst = cast<Instruction>( 2030 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2031 2032 // Set the insert point after the last scalarized instruction. This ensures 2033 // the insertelement sequence will directly follow the scalar definitions. 2034 auto OldIP = Builder.saveIP(); 2035 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2036 Builder.SetInsertPoint(&*NewIP); 2037 2038 // However, if we are vectorizing, we need to construct the vector values. 2039 // If the value is known to be uniform after vectorization, we can just 2040 // broadcast the scalar value corresponding to lane zero for each unroll 2041 // iteration. Otherwise, we construct the vector values using insertelement 2042 // instructions. Since the resulting vectors are stored in 2043 // VectorLoopValueMap, we will only generate the insertelements once. 2044 Value *VectorValue = nullptr; 2045 if (Cost->isUniformAfterVectorization(I, VF)) { 2046 VectorValue = getBroadcastInstrs(ScalarValue); 2047 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2048 } else { 2049 // Initialize packing with insertelements to start from undef. 2050 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2051 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2052 for (unsigned Lane = 0; Lane < VF; ++Lane) 2053 packScalarIntoVectorValue(V, {Part, Lane}); 2054 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2055 } 2056 Builder.restoreIP(OldIP); 2057 return VectorValue; 2058 } 2059 2060 // If this scalar is unknown, assume that it is a constant or that it is 2061 // loop invariant. Broadcast V and save the value for future uses. 2062 Value *B = getBroadcastInstrs(V); 2063 VectorLoopValueMap.setVectorValue(V, Part, B); 2064 return B; 2065 } 2066 2067 Value * 2068 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2069 const VPIteration &Instance) { 2070 // If the value is not an instruction contained in the loop, it should 2071 // already be scalar. 2072 if (OrigLoop->isLoopInvariant(V)) 2073 return V; 2074 2075 assert(Instance.Lane > 0 2076 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2077 : true && "Uniform values only have lane zero"); 2078 2079 // If the value from the original loop has not been vectorized, it is 2080 // represented by UF x VF scalar values in the new loop. Return the requested 2081 // scalar value. 2082 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2083 return VectorLoopValueMap.getScalarValue(V, Instance); 2084 2085 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2086 // for the given unroll part. 
If this entry is not a vector type (i.e., the 2087 // vectorization factor is one), there is no need to generate an 2088 // extractelement instruction. 2089 auto *U = getOrCreateVectorValue(V, Instance.Part); 2090 if (!U->getType()->isVectorTy()) { 2091 assert(VF == 1 && "Value not scalarized has non-vector type"); 2092 return U; 2093 } 2094 2095 // Otherwise, the value from the original loop has been vectorized and is 2096 // represented by UF vector values. Extract and return the requested scalar 2097 // value from the appropriate vector lane. 2098 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2099 } 2100 2101 void InnerLoopVectorizer::packScalarIntoVectorValue( 2102 Value *V, const VPIteration &Instance) { 2103 assert(V != Induction && "The new induction variable should not be used."); 2104 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2105 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2106 2107 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2108 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2109 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2110 Builder.getInt32(Instance.Lane)); 2111 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2112 } 2113 2114 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2115 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2116 SmallVector<Constant *, 8> ShuffleMask; 2117 for (unsigned i = 0; i < VF; ++i) 2118 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2119 2120 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2121 ConstantVector::get(ShuffleMask), 2122 "reverse"); 2123 } 2124 2125 // Return whether we allow using masked interleave-groups (for dealing with 2126 // strided loads/stores that reside in predicated blocks, or for dealing 2127 // with gaps). 2128 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2129 // If an override option has been passed in for interleaved accesses, use it. 2130 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2131 return EnableMaskedInterleavedMemAccesses; 2132 2133 return TTI.enableMaskedInterleavedAccessVectorization(); 2134 } 2135 2136 // Try to vectorize the interleave group that \p Instr belongs to. 2137 // 2138 // E.g. Translate following interleaved load group (factor = 3): 2139 // for (i = 0; i < N; i+=3) { 2140 // R = Pic[i]; // Member of index 0 2141 // G = Pic[i+1]; // Member of index 1 2142 // B = Pic[i+2]; // Member of index 2 2143 // ... // do something to R, G, B 2144 // } 2145 // To: 2146 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2147 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2148 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2149 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2150 // 2151 // Or translate following interleaved store group (factor = 3): 2152 // for (i = 0; i < N; i+=3) { 2153 // ... 
do something to R, G, B 2154 // Pic[i] = R; // Member of index 0 2155 // Pic[i+1] = G; // Member of index 1 2156 // Pic[i+2] = B; // Member of index 2 2157 // } 2158 // To: 2159 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2160 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2161 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2162 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2163 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2164 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2165 VectorParts *BlockInMask) { 2166 const InterleaveGroup<Instruction> *Group = 2167 Cost->getInterleavedAccessGroup(Instr); 2168 assert(Group && "Fail to get an interleaved access group."); 2169 2170 // Skip if current instruction is not the insert position. 2171 if (Instr != Group->getInsertPos()) 2172 return; 2173 2174 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2175 Value *Ptr = getLoadStorePointerOperand(Instr); 2176 2177 // Prepare for the vector type of the interleaved load/store. 2178 Type *ScalarTy = getMemInstValueType(Instr); 2179 unsigned InterleaveFactor = Group->getFactor(); 2180 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2181 Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); 2182 2183 // Prepare for the new pointers. 2184 setDebugLocFromInst(Builder, Ptr); 2185 SmallVector<Value *, 2> NewPtrs; 2186 unsigned Index = Group->getIndex(Instr); 2187 2188 VectorParts Mask; 2189 bool IsMaskForCondRequired = BlockInMask; 2190 if (IsMaskForCondRequired) { 2191 Mask = *BlockInMask; 2192 // TODO: extend the masked interleaved-group support to reversed access. 2193 assert(!Group->isReverse() && "Reversed masked interleave-group " 2194 "not supported."); 2195 } 2196 2197 // If the group is reverse, adjust the index to refer to the last vector lane 2198 // instead of the first. We adjust the index from the first vector lane, 2199 // rather than directly getting the pointer for lane VF - 1, because the 2200 // pointer operand of the interleaved access is supposed to be uniform. For 2201 // uniform instructions, we're only required to generate a value for the 2202 // first vector lane in each unroll iteration. 2203 if (Group->isReverse()) 2204 Index += (VF - 1) * Group->getFactor(); 2205 2206 bool InBounds = false; 2207 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2208 InBounds = gep->isInBounds(); 2209 2210 for (unsigned Part = 0; Part < UF; Part++) { 2211 Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); 2212 2213 // Notice current instruction could be any index. Need to adjust the address 2214 // to the member of index 0. 2215 // 2216 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2217 // b = A[i]; // Member of index 0 2218 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2219 // 2220 // E.g. A[i+1] = a; // Member of index 1 2221 // A[i] = b; // Member of index 0 2222 // A[i+2] = c; // Member of index 2 (Current instruction) 2223 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2224 NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); 2225 if (InBounds) 2226 cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); 2227 2228 // Cast to the vector pointer type. 
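// For illustration: with an i32 member type, factor 3 and VF = 4, VecTy is
// <12 x i32>, so the adjusted pointer is cast to <12 x i32>* (in the access's
// address space) for the wide load/store below.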
2229 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); 2230 } 2231 2232 setDebugLocFromInst(Builder, Instr); 2233 Value *UndefVec = UndefValue::get(VecTy); 2234 2235 Value *MaskForGaps = nullptr; 2236 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2237 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2238 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2239 } 2240 2241 // Vectorize the interleaved load group. 2242 if (isa<LoadInst>(Instr)) { 2243 // For each unroll part, create a wide load for the group. 2244 SmallVector<Value *, 2> NewLoads; 2245 for (unsigned Part = 0; Part < UF; Part++) { 2246 Instruction *NewLoad; 2247 if (IsMaskForCondRequired || MaskForGaps) { 2248 assert(useMaskedInterleavedAccesses(*TTI) && 2249 "masked interleaved groups are not allowed."); 2250 Value *GroupMask = MaskForGaps; 2251 if (IsMaskForCondRequired) { 2252 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2253 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2254 Value *ShuffledMask = Builder.CreateShuffleVector( 2255 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2256 GroupMask = MaskForGaps 2257 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2258 MaskForGaps) 2259 : ShuffledMask; 2260 } 2261 NewLoad = 2262 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), 2263 GroupMask, UndefVec, "wide.masked.vec"); 2264 } 2265 else 2266 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], 2267 Group->getAlignment(), "wide.vec"); 2268 Group->addMetadata(NewLoad); 2269 NewLoads.push_back(NewLoad); 2270 } 2271 2272 // For each member in the group, shuffle out the appropriate data from the 2273 // wide loads. 2274 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2275 Instruction *Member = Group->getMember(I); 2276 2277 // Skip the gaps in the group. 2278 if (!Member) 2279 continue; 2280 2281 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2282 for (unsigned Part = 0; Part < UF; Part++) { 2283 Value *StridedVec = Builder.CreateShuffleVector( 2284 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2285 2286 // If this member has different type, cast the result type. 2287 if (Member->getType() != ScalarTy) { 2288 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2289 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2290 } 2291 2292 if (Group->isReverse()) 2293 StridedVec = reverseVector(StridedVec); 2294 2295 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2296 } 2297 } 2298 return; 2299 } 2300 2301 // The sub vector type for current instruction. 2302 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2303 2304 // Vectorize the interleaved store group. 2305 for (unsigned Part = 0; Part < UF; Part++) { 2306 // Collect the stored vector from each member. 2307 SmallVector<Value *, 4> StoredVecs; 2308 for (unsigned i = 0; i < InterleaveFactor; i++) { 2309 // Interleaved store group doesn't allow a gap, so each index has a member 2310 Instruction *Member = Group->getMember(i); 2311 assert(Member && "Fail to get a member from an interleaved store group"); 2312 2313 Value *StoredVec = getOrCreateVectorValue( 2314 cast<StoreInst>(Member)->getValueOperand(), Part); 2315 if (Group->isReverse()) 2316 StoredVec = reverseVector(StoredVec); 2317 2318 // If this member has different type, cast it to a unified type. 
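// For illustration (assuming the group's insert position is an i32 store): a
// float member's <4 x float> stored vector (VF = 4) is bit-cast to
// SubVT = <4 x i32> just below, so all members can be concatenated and
// interleaved together.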
2319 2320 if (StoredVec->getType() != SubVT) 2321 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2322 2323 StoredVecs.push_back(StoredVec); 2324 } 2325 2326 // Concatenate all vectors into a wide vector. 2327 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2328 2329 // Interleave the elements in the wide vector. 2330 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2331 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2332 "interleaved.vec"); 2333 2334 Instruction *NewStoreInstr; 2335 if (IsMaskForCondRequired) { 2336 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2337 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2338 Value *ShuffledMask = Builder.CreateShuffleVector( 2339 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2340 NewStoreInstr = Builder.CreateMaskedStore( 2341 IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); 2342 } 2343 else 2344 NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 2345 Group->getAlignment()); 2346 2347 Group->addMetadata(NewStoreInstr); 2348 } 2349 } 2350 2351 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2352 VectorParts *BlockInMask) { 2353 // Attempt to issue a wide load. 2354 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2355 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2356 2357 assert((LI || SI) && "Invalid Load/Store instruction"); 2358 2359 LoopVectorizationCostModel::InstWidening Decision = 2360 Cost->getWideningDecision(Instr, VF); 2361 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2362 "CM decision should be taken at this point"); 2363 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2364 return vectorizeInterleaveGroup(Instr); 2365 2366 Type *ScalarDataTy = getMemInstValueType(Instr); 2367 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2368 Value *Ptr = getLoadStorePointerOperand(Instr); 2369 // An alignment of 0 means target abi alignment. We need to use the scalar's 2370 // target abi alignment in such a case. 2371 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2372 const Align Alignment = 2373 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2374 unsigned AddressSpace = getLoadStoreAddressSpace(Instr); 2375 2376 // Determine if the pointer operand of the access is either consecutive or 2377 // reverse consecutive. 2378 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2379 bool ConsecutiveStride = 2380 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2381 bool CreateGatherScatter = 2382 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2383 2384 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2385 // gather/scatter. Otherwise Decision should have been to Scalarize. 2386 assert((ConsecutiveStride || CreateGatherScatter) && 2387 "The instruction should be scalarized"); 2388 2389 // Handle consecutive loads/stores. 2390 if (ConsecutiveStride) 2391 Ptr = getOrCreateScalarValue(Ptr, {0, 0}); 2392 2393 VectorParts Mask; 2394 bool isMaskRequired = BlockInMask; 2395 if (isMaskRequired) 2396 Mask = *BlockInMask; 2397 2398 bool InBounds = false; 2399 if (auto *gep = dyn_cast<GetElementPtrInst>( 2400 getLoadStorePointerOperand(Instr)->stripPointerCasts())) 2401 InBounds = gep->isInBounds(); 2402 2403 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2404 // Calculate the pointer for the specific unroll-part. 
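// For illustration (assuming VF = 4): for Part = 1 of a reverse access the
// two GEPs below form Ptr - 4 and then Ptr - 7, so the wide access covers
// elements [Ptr - 7, Ptr - 4] and is paired with a reversed value/mask; a
// forward access for Part = 1 simply uses Ptr + 4.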
2405 GetElementPtrInst *PartPtr = nullptr; 2406 2407 if (Reverse) { 2408 // If the address is consecutive but reversed, then the 2409 // wide store needs to start at the last vector element. 2410 PartPtr = cast<GetElementPtrInst>( 2411 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2412 PartPtr->setIsInBounds(InBounds); 2413 PartPtr = cast<GetElementPtrInst>( 2414 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2415 PartPtr->setIsInBounds(InBounds); 2416 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2417 Mask[Part] = reverseVector(Mask[Part]); 2418 } else { 2419 PartPtr = cast<GetElementPtrInst>( 2420 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2421 PartPtr->setIsInBounds(InBounds); 2422 } 2423 2424 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2425 }; 2426 2427 // Handle Stores: 2428 if (SI) { 2429 setDebugLocFromInst(Builder, SI); 2430 2431 for (unsigned Part = 0; Part < UF; ++Part) { 2432 Instruction *NewSI = nullptr; 2433 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); 2434 if (CreateGatherScatter) { 2435 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2436 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2437 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, 2438 Alignment.value(), MaskPart); 2439 } else { 2440 if (Reverse) { 2441 // If we store to reverse consecutive memory locations, then we need 2442 // to reverse the order of elements in the stored value. 2443 StoredVal = reverseVector(StoredVal); 2444 // We don't want to update the value in the map as it might be used in 2445 // another expression. So don't call resetVectorValue(StoredVal). 2446 } 2447 auto *VecPtr = CreateVecPtr(Part, Ptr); 2448 if (isMaskRequired) 2449 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, 2450 Alignment.value(), Mask[Part]); 2451 else 2452 NewSI = 2453 Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); 2454 } 2455 addMetadata(NewSI, SI); 2456 } 2457 return; 2458 } 2459 2460 // Handle loads. 2461 assert(LI && "Must have a load instruction"); 2462 setDebugLocFromInst(Builder, LI); 2463 for (unsigned Part = 0; Part < UF; ++Part) { 2464 Value *NewLI; 2465 if (CreateGatherScatter) { 2466 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2467 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2468 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, 2469 nullptr, "wide.masked.gather"); 2470 addMetadata(NewLI, LI); 2471 } else { 2472 auto *VecPtr = CreateVecPtr(Part, Ptr); 2473 if (isMaskRequired) 2474 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part], 2475 UndefValue::get(DataTy), 2476 "wide.masked.load"); 2477 else 2478 NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), 2479 "wide.load"); 2480 2481 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2482 addMetadata(NewLI, LI); 2483 if (Reverse) 2484 NewLI = reverseVector(NewLI); 2485 } 2486 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2487 } 2488 } 2489 2490 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2491 const VPIteration &Instance, 2492 bool IfPredicateInstr) { 2493 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2494 2495 setDebugLocFromInst(Builder, Instr); 2496 2497 // Does this instruction return a value ? 
2498 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2499 2500 Instruction *Cloned = Instr->clone(); 2501 if (!IsVoidRetTy) 2502 Cloned->setName(Instr->getName() + ".cloned"); 2503 2504 // Replace the operands of the cloned instructions with their scalar 2505 // equivalents in the new loop. 2506 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2507 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2508 Cloned->setOperand(op, NewOp); 2509 } 2510 addNewMetadata(Cloned, Instr); 2511 2512 // Place the cloned scalar in the new loop. 2513 Builder.Insert(Cloned); 2514 2515 // Add the cloned scalar to the scalar map entry. 2516 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2517 2518 // If we just cloned a new assumption, add it the assumption cache. 2519 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2520 if (II->getIntrinsicID() == Intrinsic::assume) 2521 AC->registerAssumption(II); 2522 2523 // End if-block. 2524 if (IfPredicateInstr) 2525 PredicatedInstructions.push_back(Cloned); 2526 } 2527 2528 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2529 Value *End, Value *Step, 2530 Instruction *DL) { 2531 BasicBlock *Header = L->getHeader(); 2532 BasicBlock *Latch = L->getLoopLatch(); 2533 // As we're just creating this loop, it's possible no latch exists 2534 // yet. If so, use the header as this will be a single block loop. 2535 if (!Latch) 2536 Latch = Header; 2537 2538 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2539 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2540 setDebugLocFromInst(Builder, OldInst); 2541 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2542 2543 Builder.SetInsertPoint(Latch->getTerminator()); 2544 setDebugLocFromInst(Builder, OldInst); 2545 2546 // Create i+1 and fill the PHINode. 2547 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2548 Induction->addIncoming(Start, L->getLoopPreheader()); 2549 Induction->addIncoming(Next, Latch); 2550 // Create the compare. 2551 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2552 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2553 2554 // Now we have two terminators. Remove the old one from the block. 2555 Latch->getTerminator()->eraseFromParent(); 2556 2557 return Induction; 2558 } 2559 2560 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2561 if (TripCount) 2562 return TripCount; 2563 2564 assert(L && "Create Trip Count for null loop."); 2565 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2566 // Find the loop boundaries. 2567 ScalarEvolution *SE = PSE.getSE(); 2568 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2569 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2570 "Invalid loop count"); 2571 2572 Type *IdxTy = Legal->getWidestInductionType(); 2573 assert(IdxTy && "No type for induction"); 2574 2575 // The exit count might have the type of i64 while the phi is i32. This can 2576 // happen if we have an induction variable that is sign extended before the 2577 // compare. The only way that we get a backedge taken count is that the 2578 // induction variable was signed and as such will not overflow. In such a case 2579 // truncation is legal. 
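// For illustration: for 'for (i = 0; i < n; ++i)' the backedge-taken count is
// n - 1 and the trip count computed below is (n - 1) + 1 = n. If the i32
// induction was compared after a sign-extension, the wider backedge-taken
// count is safely truncated here to the widest induction type.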
2580 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2581 IdxTy->getPrimitiveSizeInBits()) 2582 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2583 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2584 2585 // Get the total trip count from the count by adding 1. 2586 const SCEV *ExitCount = SE->getAddExpr( 2587 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2588 2589 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2590 2591 // Expand the trip count and place the new instructions in the preheader. 2592 // Notice that the pre-header does not change, only the loop body. 2593 SCEVExpander Exp(*SE, DL, "induction"); 2594 2595 // Count holds the overall loop count (N). 2596 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2597 L->getLoopPreheader()->getTerminator()); 2598 2599 if (TripCount->getType()->isPointerTy()) 2600 TripCount = 2601 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2602 L->getLoopPreheader()->getTerminator()); 2603 2604 return TripCount; 2605 } 2606 2607 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2608 if (VectorTripCount) 2609 return VectorTripCount; 2610 2611 Value *TC = getOrCreateTripCount(L); 2612 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2613 2614 Type *Ty = TC->getType(); 2615 Constant *Step = ConstantInt::get(Ty, VF * UF); 2616 2617 // If the tail is to be folded by masking, round the number of iterations N 2618 // up to a multiple of Step instead of rounding down. This is done by first 2619 // adding Step-1 and then rounding down. Note that it's ok if this addition 2620 // overflows: the vector induction variable will eventually wrap to zero given 2621 // that it starts at zero and its Step is a power of two; the loop will then 2622 // exit, with the last early-exit vector comparison also producing all-true. 2623 if (Cost->foldTailByMasking()) { 2624 assert(isPowerOf2_32(VF * UF) && 2625 "VF*UF must be a power of 2 when folding tail by masking"); 2626 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2627 } 2628 2629 // Now we need to generate the expression for the part of the loop that the 2630 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2631 // iterations are not required for correctness, or N - Step, otherwise. Step 2632 // is equal to the vectorization factor (number of SIMD elements) times the 2633 // unroll factor (number of SIMD instructions). 2634 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2635 2636 // If there is a non-reversed interleaved group that may speculatively access 2637 // memory out-of-bounds, we need to ensure that there will be at least one 2638 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2639 // the trip count, we set the remainder to be equal to the step. If the step 2640 // does not evenly divide the trip count, no adjustment is necessary since 2641 // there will already be scalar iterations. Note that the minimum iterations 2642 // check ensures that N >= Step. 
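// For illustration, with VF * UF = 8:
//  - TC = 17, no tail folding: R = 17 % 8 = 1, n.vec = 16, and one scalar
//    iteration remains.
//  - TC = 16 and a scalar epilogue is required: R would be 0, so it is bumped
//    to 8 below and n.vec = 8, guaranteeing scalar iterations for the
//    interleave-group gap accesses.
//  - TC = 17 with tail folding: TC was first rounded up to 24 above, so
//    R = 0 and n.vec = 24.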
2643 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2644 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2645 R = Builder.CreateSelect(IsZero, Step, R); 2646 } 2647 2648 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2649 2650 return VectorTripCount; 2651 } 2652 2653 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2654 const DataLayout &DL) { 2655 // Verify that V is a vector type with same number of elements as DstVTy. 2656 unsigned VF = DstVTy->getNumElements(); 2657 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2658 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2659 Type *SrcElemTy = SrcVecTy->getElementType(); 2660 Type *DstElemTy = DstVTy->getElementType(); 2661 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2662 "Vector elements must have same size"); 2663 2664 // Do a direct cast if element types are castable. 2665 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2666 return Builder.CreateBitOrPointerCast(V, DstVTy); 2667 } 2668 // V cannot be directly casted to desired vector type. 2669 // May happen when V is a floating point vector but DstVTy is a vector of 2670 // pointers or vice-versa. Handle this using a two-step bitcast using an 2671 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2672 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2673 "Only one type should be a pointer type"); 2674 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2675 "Only one type should be a floating point type"); 2676 Type *IntTy = 2677 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2678 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2679 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2680 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2681 } 2682 2683 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2684 BasicBlock *Bypass) { 2685 Value *Count = getOrCreateTripCount(L); 2686 // Reuse existing vector loop preheader for TC checks. 2687 // Note that new preheader block is generated for vector loop. 2688 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2689 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2690 2691 // Generate code to check if the loop's trip count is less than VF * UF, or 2692 // equal to it in case a scalar epilogue is required; this implies that the 2693 // vector trip count is zero. This check also covers the case where adding one 2694 // to the backedge-taken count overflowed leading to an incorrect trip count 2695 // of zero. In this case we will also jump to the scalar loop. 2696 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2697 : ICmpInst::ICMP_ULT; 2698 2699 // If tail is to be folded, vector loop takes care of all iterations. 2700 Value *CheckMinIters = Builder.getFalse(); 2701 if (!Cost->foldTailByMasking()) 2702 CheckMinIters = Builder.CreateICmp( 2703 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2704 "min.iters.check"); 2705 2706 // Create new preheader for vector loop. 2707 LoopVectorPreHeader = 2708 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2709 "vector.ph"); 2710 2711 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2712 DT->getNode(Bypass)->getIDom()) && 2713 "TC check is expected to dominate Bypass"); 2714 2715 // Update dominator for Bypass & LoopExit. 
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}

void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  // Reuse existing vector loop preheader for SCEV checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;

  // Generate the code to check the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck = Exp.expandCodeForPredicate(
      &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());

  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  SCEVCheckBlock->setName("vector.scevcheck");
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
                 nullptr, "vector.ph");

  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  ReplaceInstWithInst(
      SCEVCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
}

void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return;

  // Reuse existing vector loop preheader for runtime memory checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const MemCheckBlock = L->getLoopPreheader();

  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
  if (!MemRuntimeCheck)
    return;

  if (MemCheckBlock->getParent()->hasOptSize()) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  MemCheckBlock->setName("vector.memcheck");
  // Create new preheader for vector loop.
2799 LoopVectorPreHeader = 2800 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2801 "vector.ph"); 2802 2803 // Update dominator only if this is first RT check. 2804 if (LoopBypassBlocks.empty()) { 2805 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2806 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2807 } 2808 2809 ReplaceInstWithInst( 2810 MemCheckBlock->getTerminator(), 2811 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2812 LoopBypassBlocks.push_back(MemCheckBlock); 2813 AddedSafetyChecks = true; 2814 2815 // We currently don't use LoopVersioning for the actual loop cloning but we 2816 // still use it to add the noalias metadata. 2817 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2818 PSE.getSE()); 2819 LVer->prepareNoAliasMetadata(); 2820 } 2821 2822 Value *InnerLoopVectorizer::emitTransformedIndex( 2823 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2824 const InductionDescriptor &ID) const { 2825 2826 SCEVExpander Exp(*SE, DL, "induction"); 2827 auto Step = ID.getStep(); 2828 auto StartValue = ID.getStartValue(); 2829 assert(Index->getType() == Step->getType() && 2830 "Index type does not match StepValue type"); 2831 2832 // Note: the IR at this point is broken. We cannot use SE to create any new 2833 // SCEV and then expand it, hoping that SCEV's simplification will give us 2834 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2835 // lead to various SCEV crashes. So all we can do is to use builder and rely 2836 // on InstCombine for future simplifications. Here we handle some trivial 2837 // cases only. 2838 auto CreateAdd = [&B](Value *X, Value *Y) { 2839 assert(X->getType() == Y->getType() && "Types don't match!"); 2840 if (auto *CX = dyn_cast<ConstantInt>(X)) 2841 if (CX->isZero()) 2842 return Y; 2843 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2844 if (CY->isZero()) 2845 return X; 2846 return B.CreateAdd(X, Y); 2847 }; 2848 2849 auto CreateMul = [&B](Value *X, Value *Y) { 2850 assert(X->getType() == Y->getType() && "Types don't match!"); 2851 if (auto *CX = dyn_cast<ConstantInt>(X)) 2852 if (CX->isOne()) 2853 return Y; 2854 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2855 if (CY->isOne()) 2856 return X; 2857 return B.CreateMul(X, Y); 2858 }; 2859 2860 switch (ID.getKind()) { 2861 case InductionDescriptor::IK_IntInduction: { 2862 assert(Index->getType() == StartValue->getType() && 2863 "Index type does not match StartValue type"); 2864 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2865 return B.CreateSub(StartValue, Index); 2866 auto *Offset = CreateMul( 2867 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2868 return CreateAdd(StartValue, Offset); 2869 } 2870 case InductionDescriptor::IK_PtrInduction: { 2871 assert(isa<SCEVConstant>(Step) && 2872 "Expected constant step for pointer induction"); 2873 return B.CreateGEP( 2874 StartValue->getType()->getPointerElementType(), StartValue, 2875 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2876 &*B.GetInsertPoint()))); 2877 } 2878 case InductionDescriptor::IK_FpInduction: { 2879 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2880 auto InductionBinOp = ID.getInductionBinOp(); 2881 assert(InductionBinOp && 2882 (InductionBinOp->getOpcode() == Instruction::FAdd || 2883 InductionBinOp->getOpcode() == Instruction::FSub) && 2884 "Original bin op should be defined for FP induction"); 2885 2886 Value 
*StepValue = cast<SCEVUnknown>(Step)->getValue(); 2887 2888 // Floating point operations had to be 'fast' to enable the induction. 2889 FastMathFlags Flags; 2890 Flags.setFast(); 2891 2892 Value *MulExp = B.CreateFMul(StepValue, Index); 2893 if (isa<Instruction>(MulExp)) 2894 // We have to check, the MulExp may be a constant. 2895 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2896 2897 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2898 "induction"); 2899 if (isa<Instruction>(BOp)) 2900 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2901 2902 return BOp; 2903 } 2904 case InductionDescriptor::IK_NoInduction: 2905 return nullptr; 2906 } 2907 llvm_unreachable("invalid enum"); 2908 } 2909 2910 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2911 /* 2912 In this function we generate a new loop. The new loop will contain 2913 the vectorized instructions while the old loop will continue to run the 2914 scalar remainder. 2915 2916 [ ] <-- loop iteration number check. 2917 / | 2918 / v 2919 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2920 | / | 2921 | / v 2922 || [ ] <-- vector pre header. 2923 |/ | 2924 | v 2925 | [ ] \ 2926 | [ ]_| <-- vector loop. 2927 | | 2928 | v 2929 | -[ ] <--- middle-block. 2930 | / | 2931 | / v 2932 -|- >[ ] <--- new preheader. 2933 | | 2934 | v 2935 | [ ] \ 2936 | [ ]_| <-- old scalar loop to handle remainder. 2937 \ | 2938 \ v 2939 >[ ] <-- exit block. 2940 ... 2941 */ 2942 2943 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2944 2945 // Some loops have a single integer induction variable, while other loops 2946 // don't. One example is c++ iterators that often have multiple pointer 2947 // induction variables. In the code below we also support a case where we 2948 // don't have a single induction variable. 2949 // 2950 // We try to obtain an induction variable from the original loop as hard 2951 // as possible. However if we don't find one that: 2952 // - is an integer 2953 // - counts from zero, stepping by one 2954 // - is the size of the widest induction variable type 2955 // then we create a new one. 2956 OldInduction = Legal->getPrimaryInduction(); 2957 Type *IdxTy = Legal->getWidestInductionType(); 2958 2959 // Split the single block loop into the two loop structure described above. 2960 LoopScalarBody = OrigLoop->getHeader(); 2961 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2962 LoopExitBlock = OrigLoop->getExitBlock(); 2963 assert(LoopExitBlock && "Must have an exit block"); 2964 assert(LoopVectorPreHeader && "Invalid loop structure"); 2965 2966 LoopMiddleBlock = 2967 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2968 LI, nullptr, "middle.block"); 2969 LoopScalarPreHeader = 2970 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2971 nullptr, "scalar.ph"); 2972 // We intentionally don't let SplitBlock to update LoopInfo since 2973 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2974 // LoopVectorBody is explicitly added to the correct place few lines later. 2975 LoopVectorBody = 2976 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2977 nullptr, nullptr, "vector.body"); 2978 2979 // Update dominator for loop exit. 2980 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2981 2982 // Create and register the new vector loop. 
2983 Loop *Lp = LI->AllocateLoop(); 2984 Loop *ParentLoop = OrigLoop->getParentLoop(); 2985 2986 // Insert the new loop into the loop nest and register the new basic blocks 2987 // before calling any utilities such as SCEV that require valid LoopInfo. 2988 if (ParentLoop) { 2989 ParentLoop->addChildLoop(Lp); 2990 } else { 2991 LI->addTopLevelLoop(Lp); 2992 } 2993 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 2994 2995 // Find the loop boundaries. 2996 Value *Count = getOrCreateTripCount(Lp); 2997 2998 Value *StartIdx = ConstantInt::get(IdxTy, 0); 2999 3000 // Now, compare the new count to zero. If it is zero skip the vector loop and 3001 // jump to the scalar loop. This check also covers the case where the 3002 // backedge-taken count is uint##_max: adding one to it will overflow leading 3003 // to an incorrect trip count of zero. In this (rare) case we will also jump 3004 // to the scalar loop. 3005 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3006 3007 // Generate the code to check any assumptions that we've made for SCEV 3008 // expressions. 3009 emitSCEVChecks(Lp, LoopScalarPreHeader); 3010 3011 // Generate the code that checks in runtime if arrays overlap. We put the 3012 // checks into a separate block to make the more common case of few elements 3013 // faster. 3014 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3015 3016 // Generate the induction variable. 3017 // The loop step is equal to the vectorization factor (num of SIMD elements) 3018 // times the unroll factor (num of SIMD instructions). 3019 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3020 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3021 Induction = 3022 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3023 getDebugLocFromInstOrOperands(OldInduction)); 3024 3025 // We are going to resume the execution of the scalar loop. 3026 // Go over all of the induction variables that we found and fix the 3027 // PHIs that are left in the scalar version of the loop. 3028 // The starting values of PHI nodes depend on the counter of the last 3029 // iteration in the vectorized loop. 3030 // If we come from a bypass edge then we need to start from the original 3031 // start value. 3032 3033 // This variable saves the new starting index for the scalar loop. It is used 3034 // to test if there are any tail iterations left once the vector loop has 3035 // completed. 3036 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 3037 for (auto &InductionEntry : *List) { 3038 PHINode *OrigPhi = InductionEntry.first; 3039 InductionDescriptor II = InductionEntry.second; 3040 3041 // Create phi nodes to merge from the backedge-taken check block. 3042 PHINode *BCResumeVal = 3043 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3044 LoopScalarPreHeader->getTerminator()); 3045 // Copy original phi DL over to the new one. 3046 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3047 Value *&EndValue = IVEndValues[OrigPhi]; 3048 if (OrigPhi == OldInduction) { 3049 // We know what the end value is. 
3050 EndValue = CountRoundDown; 3051 } else { 3052 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3053 Type *StepType = II.getStep()->getType(); 3054 Instruction::CastOps CastOp = 3055 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3056 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3057 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3058 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3059 EndValue->setName("ind.end"); 3060 } 3061 3062 // The new PHI merges the original incoming value, in case of a bypass, 3063 // or the value at the end of the vectorized loop. 3064 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3065 3066 // Fix the scalar body counter (PHI node). 3067 // The old induction's phi node in the scalar body needs the truncated 3068 // value. 3069 for (BasicBlock *BB : LoopBypassBlocks) 3070 BCResumeVal->addIncoming(II.getStartValue(), BB); 3071 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3072 } 3073 3074 // We need the OrigLoop (scalar loop part) latch terminator to help 3075 // produce correct debug info for the middle block BB instructions. 3076 // The legality check stage guarantees that the loop will have a single 3077 // latch. 3078 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3079 "Scalar loop latch terminator isn't a branch"); 3080 BranchInst *ScalarLatchBr = 3081 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3082 3083 // Add a check in the middle block to see if we have completed 3084 // all of the iterations in the first vector loop. 3085 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3086 // If tail is to be folded, we know we don't need to run the remainder. 3087 Value *CmpN = Builder.getTrue(); 3088 if (!Cost->foldTailByMasking()) { 3089 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3090 CountRoundDown, "cmp.n", 3091 LoopMiddleBlock->getTerminator()); 3092 3093 // Here we use the same DebugLoc as the scalar loop latch branch instead 3094 // of the corresponding compare because they may have ended up with 3095 // different line numbers and we want to avoid awkward line stepping while 3096 // debugging. Eg. if the compare has got a line number inside the loop. 3097 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3098 } 3099 3100 BranchInst *BrInst = 3101 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3102 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3103 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3104 3105 // Get ready to start creating new instructions into the vectorized body. 3106 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3107 "Inconsistent vector loop preheader"); 3108 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3109 3110 Optional<MDNode *> VectorizedLoopID = 3111 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3112 LLVMLoopVectorizeFollowupVectorized}); 3113 if (VectorizedLoopID.hasValue()) { 3114 Lp->setLoopID(VectorizedLoopID.getValue()); 3115 3116 // Do not setAlreadyVectorized if loop attributes have been defined 3117 // explicitly. 3118 return LoopVectorPreHeader; 3119 } 3120 3121 // Keep all loop hints from the original loop on the vector loop (we'll 3122 // replace the vectorizer-specific hints below). 
  if (MDNode *LID = OrigLoop->getLoopID())
    Lp->setLoopID(LID);

  LoopVectorizeHints Hints(Lp, true, *ORE);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
3197 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3198 PHI->addIncoming(I.second, MiddleBlock); 3199 } 3200 } 3201 3202 namespace { 3203 3204 struct CSEDenseMapInfo { 3205 static bool canHandle(const Instruction *I) { 3206 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3207 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3208 } 3209 3210 static inline Instruction *getEmptyKey() { 3211 return DenseMapInfo<Instruction *>::getEmptyKey(); 3212 } 3213 3214 static inline Instruction *getTombstoneKey() { 3215 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3216 } 3217 3218 static unsigned getHashValue(const Instruction *I) { 3219 assert(canHandle(I) && "Unknown instruction!"); 3220 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3221 I->value_op_end())); 3222 } 3223 3224 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3225 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3226 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3227 return LHS == RHS; 3228 return LHS->isIdenticalTo(RHS); 3229 } 3230 }; 3231 3232 } // end anonymous namespace 3233 3234 ///Perform cse of induction variable instructions. 3235 static void cse(BasicBlock *BB) { 3236 // Perform simple cse. 3237 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3238 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3239 Instruction *In = &*I++; 3240 3241 if (!CSEDenseMapInfo::canHandle(In)) 3242 continue; 3243 3244 // Check if we can replace this instruction with any of the 3245 // visited instructions. 3246 if (Instruction *V = CSEMap.lookup(In)) { 3247 In->replaceAllUsesWith(V); 3248 In->eraseFromParent(); 3249 continue; 3250 } 3251 3252 CSEMap[In] = In; 3253 } 3254 } 3255 3256 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3257 unsigned VF, 3258 bool &NeedToScalarize) { 3259 Function *F = CI->getCalledFunction(); 3260 StringRef FnName = CI->getCalledFunction()->getName(); 3261 Type *ScalarRetTy = CI->getType(); 3262 SmallVector<Type *, 4> Tys, ScalarTys; 3263 for (auto &ArgOp : CI->arg_operands()) 3264 ScalarTys.push_back(ArgOp->getType()); 3265 3266 // Estimate cost of scalarized vector call. The source operands are assumed 3267 // to be vectors, so we need to extract individual elements from there, 3268 // execute VF scalar calls, and then gather the result into the vector return 3269 // value. 3270 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3271 if (VF == 1) 3272 return ScalarCallCost; 3273 3274 // Compute corresponding vector type for return value and arguments. 3275 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3276 for (Type *ScalarTy : ScalarTys) 3277 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3278 3279 // Compute costs of unpacking argument values for the scalar calls and 3280 // packing the return values to a vector. 3281 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3282 3283 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3284 3285 // If we can't emit a vector call for this function, then the currently found 3286 // cost is the cost we need to return. 3287 NeedToScalarize = true; 3288 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) 3289 return Cost; 3290 3291 // If the corresponding vector cost is cheaper, return its cost. 
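  // Illustrative example (arbitrary numbers, not actual TTI output): with
  // VF = 4, a scalar call cost of 10 and a scalarization overhead of 12, the
  // scalarized cost is 4 * 10 + 12 = 52; a vector library call costing, say,
  // 20 would then be preferred below and NeedToScalarize is cleared.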
3292 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3293 if (VectorCallCost < Cost) { 3294 NeedToScalarize = false; 3295 return VectorCallCost; 3296 } 3297 return Cost; 3298 } 3299 3300 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3301 unsigned VF) { 3302 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3303 assert(ID && "Expected intrinsic call!"); 3304 3305 FastMathFlags FMF; 3306 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3307 FMF = FPMO->getFastMathFlags(); 3308 3309 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3310 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); 3311 } 3312 3313 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3314 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3315 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3316 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3317 } 3318 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3319 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3320 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3321 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3322 } 3323 3324 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3325 // For every instruction `I` in MinBWs, truncate the operands, create a 3326 // truncated version of `I` and reextend its result. InstCombine runs 3327 // later and will remove any ext/trunc pairs. 3328 SmallPtrSet<Value *, 4> Erased; 3329 for (const auto &KV : Cost->getMinimalBitwidths()) { 3330 // If the value wasn't vectorized, we must maintain the original scalar 3331 // type. The absence of the value from VectorLoopValueMap indicates that it 3332 // wasn't vectorized. 3333 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3334 continue; 3335 for (unsigned Part = 0; Part < UF; ++Part) { 3336 Value *I = getOrCreateVectorValue(KV.first, Part); 3337 if (Erased.find(I) != Erased.end() || I->use_empty() || 3338 !isa<Instruction>(I)) 3339 continue; 3340 Type *OriginalTy = I->getType(); 3341 Type *ScalarTruncatedTy = 3342 IntegerType::get(OriginalTy->getContext(), KV.second); 3343 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3344 OriginalTy->getVectorNumElements()); 3345 if (TruncatedTy == OriginalTy) 3346 continue; 3347 3348 IRBuilder<> B(cast<Instruction>(I)); 3349 auto ShrinkOperand = [&](Value *V) -> Value * { 3350 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3351 if (ZI->getSrcTy() == TruncatedTy) 3352 return ZI->getOperand(0); 3353 return B.CreateZExtOrTrunc(V, TruncatedTy); 3354 }; 3355 3356 // The actual instruction modification depends on the instruction type, 3357 // unfortunately. 3358 Value *NewI = nullptr; 3359 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3360 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3361 ShrinkOperand(BO->getOperand(1))); 3362 3363 // Any wrapping introduced by shrinking this operation shouldn't be 3364 // considered undefined behavior. So, we can't unconditionally copy 3365 // arithmetic wrapping flags to NewI. 
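        // For example (illustrative): an 'add nuw i32' whose operands happen
        // to fit in 8 bits may legitimately wrap once it is narrowed to i8,
        // so keeping nuw/nsw on the narrowed instruction could introduce
        // poison that the original, wider operation never produced.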
3366 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3367 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3368 NewI = 3369 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3370 ShrinkOperand(CI->getOperand(1))); 3371 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3372 NewI = B.CreateSelect(SI->getCondition(), 3373 ShrinkOperand(SI->getTrueValue()), 3374 ShrinkOperand(SI->getFalseValue())); 3375 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3376 switch (CI->getOpcode()) { 3377 default: 3378 llvm_unreachable("Unhandled cast!"); 3379 case Instruction::Trunc: 3380 NewI = ShrinkOperand(CI->getOperand(0)); 3381 break; 3382 case Instruction::SExt: 3383 NewI = B.CreateSExtOrTrunc( 3384 CI->getOperand(0), 3385 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3386 break; 3387 case Instruction::ZExt: 3388 NewI = B.CreateZExtOrTrunc( 3389 CI->getOperand(0), 3390 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3391 break; 3392 } 3393 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3394 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3395 auto *O0 = B.CreateZExtOrTrunc( 3396 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3397 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3398 auto *O1 = B.CreateZExtOrTrunc( 3399 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3400 3401 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3402 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3403 // Don't do anything with the operands, just extend the result. 3404 continue; 3405 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3406 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3407 auto *O0 = B.CreateZExtOrTrunc( 3408 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3409 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3410 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3411 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3412 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3413 auto *O0 = B.CreateZExtOrTrunc( 3414 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3415 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3416 } else { 3417 // If we don't know what to do, be conservative and don't do anything. 3418 continue; 3419 } 3420 3421 // Lastly, extend the result. 3422 NewI->takeName(cast<Instruction>(I)); 3423 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3424 I->replaceAllUsesWith(Res); 3425 cast<Instruction>(I)->eraseFromParent(); 3426 Erased.insert(I); 3427 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3428 } 3429 } 3430 3431 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3432 for (const auto &KV : Cost->getMinimalBitwidths()) { 3433 // If the value wasn't vectorized, we must maintain the original scalar 3434 // type. The absence of the value from VectorLoopValueMap indicates that it 3435 // wasn't vectorized. 
3436 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3437 continue; 3438 for (unsigned Part = 0; Part < UF; ++Part) { 3439 Value *I = getOrCreateVectorValue(KV.first, Part); 3440 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3441 if (Inst && Inst->use_empty()) { 3442 Value *NewI = Inst->getOperand(0); 3443 Inst->eraseFromParent(); 3444 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3445 } 3446 } 3447 } 3448 } 3449 3450 void InnerLoopVectorizer::fixVectorizedLoop() { 3451 // Insert truncates and extends for any truncated instructions as hints to 3452 // InstCombine. 3453 if (VF > 1) 3454 truncateToMinimalBitwidths(); 3455 3456 // Fix widened non-induction PHIs by setting up the PHI operands. 3457 if (OrigPHIsToFix.size()) { 3458 assert(EnableVPlanNativePath && 3459 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3460 fixNonInductionPHIs(); 3461 } 3462 3463 // At this point every instruction in the original loop is widened to a 3464 // vector form. Now we need to fix the recurrences in the loop. These PHI 3465 // nodes are currently empty because we did not want to introduce cycles. 3466 // This is the second stage of vectorizing recurrences. 3467 fixCrossIterationPHIs(); 3468 3469 // Forget the original basic block. 3470 PSE.getSE()->forgetLoop(OrigLoop); 3471 3472 // Fix-up external users of the induction variables. 3473 for (auto &Entry : *Legal->getInductionVars()) 3474 fixupIVUsers(Entry.first, Entry.second, 3475 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3476 IVEndValues[Entry.first], LoopMiddleBlock); 3477 3478 fixLCSSAPHIs(); 3479 for (Instruction *PI : PredicatedInstructions) 3480 sinkScalarOperands(&*PI); 3481 3482 // Remove redundant induction instructions. 3483 cse(LoopVectorBody); 3484 } 3485 3486 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3487 // In order to support recurrences we need to be able to vectorize Phi nodes. 3488 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3489 // stage #2: We now need to fix the recurrences by adding incoming edges to 3490 // the currently empty PHI nodes. At this point every instruction in the 3491 // original loop is widened to a vector form so we can use them to construct 3492 // the incoming edges. 3493 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3494 // Handle first-order recurrences and reductions that need to be fixed. 3495 if (Legal->isFirstOrderRecurrence(&Phi)) 3496 fixFirstOrderRecurrence(&Phi); 3497 else if (Legal->isReductionVariable(&Phi)) 3498 fixReduction(&Phi); 3499 } 3500 } 3501 3502 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3503 // This is the second phase of vectorizing first-order recurrences. An 3504 // overview of the transformation is described below. Suppose we have the 3505 // following loop. 3506 // 3507 // for (int i = 0; i < n; ++i) 3508 // b[i] = a[i] - a[i - 1]; 3509 // 3510 // There is a first-order recurrence on "a". For this loop, the shorthand 3511 // scalar IR looks like: 3512 // 3513 // scalar.ph: 3514 // s_init = a[-1] 3515 // br scalar.body 3516 // 3517 // scalar.body: 3518 // i = phi [0, scalar.ph], [i+1, scalar.body] 3519 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3520 // s2 = a[i] 3521 // b[i] = s2 - s1 3522 // br cond, scalar.body, ... 3523 // 3524 // In this example, s1 is a recurrence because it's value depends on the 3525 // previous iteration. In the first phase of vectorization, we created a 3526 // temporary value for s1. 
We now complete the vectorization and produce the 3527 // shorthand vector IR shown below (for VF = 4, UF = 1). 3528 // 3529 // vector.ph: 3530 // v_init = vector(..., ..., ..., a[-1]) 3531 // br vector.body 3532 // 3533 // vector.body 3534 // i = phi [0, vector.ph], [i+4, vector.body] 3535 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3536 // v2 = a[i, i+1, i+2, i+3]; 3537 // v3 = vector(v1(3), v2(0, 1, 2)) 3538 // b[i, i+1, i+2, i+3] = v2 - v3 3539 // br cond, vector.body, middle.block 3540 // 3541 // middle.block: 3542 // x = v2(3) 3543 // br scalar.ph 3544 // 3545 // scalar.ph: 3546 // s_init = phi [x, middle.block], [a[-1], otherwise] 3547 // br scalar.body 3548 // 3549 // After execution completes the vector loop, we extract the next value of 3550 // the recurrence (x) to use as the initial value in the scalar loop. 3551 3552 // Get the original loop preheader and single loop latch. 3553 auto *Preheader = OrigLoop->getLoopPreheader(); 3554 auto *Latch = OrigLoop->getLoopLatch(); 3555 3556 // Get the initial and previous values of the scalar recurrence. 3557 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3558 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3559 3560 // Create a vector from the initial value. 3561 auto *VectorInit = ScalarInit; 3562 if (VF > 1) { 3563 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3564 VectorInit = Builder.CreateInsertElement( 3565 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3566 Builder.getInt32(VF - 1), "vector.recur.init"); 3567 } 3568 3569 // We constructed a temporary phi node in the first phase of vectorization. 3570 // This phi node will eventually be deleted. 3571 Builder.SetInsertPoint( 3572 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3573 3574 // Create a phi node for the new recurrence. The current value will either be 3575 // the initial value inserted into a vector or loop-varying vector value. 3576 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3577 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3578 3579 // Get the vectorized previous value of the last part UF - 1. It appears last 3580 // among all unrolled iterations, due to the order of their construction. 3581 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3582 3583 // Find and set the insertion point after the previous value if it is an 3584 // instruction. 3585 BasicBlock::iterator InsertPt; 3586 // Note that the previous value may have been constant-folded so it is not 3587 // guaranteed to be an instruction in the vector loop. 3588 // FIXME: Loop invariant values do not form recurrences. We should deal with 3589 // them earlier. 3590 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3591 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3592 else { 3593 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3594 if (isa<PHINode>(PreviousLastPart)) 3595 // If the previous value is a phi node, we should insert after all the phi 3596 // nodes in the block containing the PHI to avoid breaking basic block 3597 // verification. Note that the basic block may be different to 3598 // LoopVectorBody, in case we predicate the loop. 
3599 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3600 else 3601 InsertPt = ++PreviousInst->getIterator(); 3602 } 3603 Builder.SetInsertPoint(&*InsertPt); 3604 3605 // We will construct a vector for the recurrence by combining the values for 3606 // the current and previous iterations. This is the required shuffle mask. 3607 SmallVector<Constant *, 8> ShuffleMask(VF); 3608 ShuffleMask[0] = Builder.getInt32(VF - 1); 3609 for (unsigned I = 1; I < VF; ++I) 3610 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3611 3612 // The vector from which to take the initial value for the current iteration 3613 // (actual or unrolled). Initially, this is the vector phi node. 3614 Value *Incoming = VecPhi; 3615 3616 // Shuffle the current and previous vector and update the vector parts. 3617 for (unsigned Part = 0; Part < UF; ++Part) { 3618 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3619 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3620 auto *Shuffle = 3621 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3622 ConstantVector::get(ShuffleMask)) 3623 : Incoming; 3624 PhiPart->replaceAllUsesWith(Shuffle); 3625 cast<Instruction>(PhiPart)->eraseFromParent(); 3626 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3627 Incoming = PreviousPart; 3628 } 3629 3630 // Fix the latch value of the new recurrence in the vector loop. 3631 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3632 3633 // Extract the last vector element in the middle block. This will be the 3634 // initial value for the recurrence when jumping to the scalar loop. 3635 auto *ExtractForScalar = Incoming; 3636 if (VF > 1) { 3637 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3638 ExtractForScalar = Builder.CreateExtractElement( 3639 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3640 } 3641 // Extract the second last element in the middle block if the 3642 // Phi is used outside the loop. We need to extract the phi itself 3643 // and not the last element (the phi update in the current iteration). This 3644 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3645 // when the scalar loop is not run at all. 3646 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3647 if (VF > 1) 3648 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3649 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3650 // When loop is unrolled without vectorizing, initialize 3651 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3652 // `Incoming`. This is analogous to the vectorized case above: extracting the 3653 // second last element when VF > 1. 3654 else if (UF > 1) 3655 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3656 3657 // Fix the initial value of the original recurrence in the scalar loop. 3658 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3659 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3660 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3661 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3662 Start->addIncoming(Incoming, BB); 3663 } 3664 3665 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3666 Phi->setName("scalar.recur"); 3667 3668 // Finally, fix users of the recurrence outside the loop. The users will need 3669 // either the last value of the scalar recurrence or the last value of the 3670 // vector recurrence we extracted in the middle block. 
Since the loop is in 3671 // LCSSA form, we just need to find all the phi nodes for the original scalar 3672 // recurrence in the exit block, and then add an edge for the middle block. 3673 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3674 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3675 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3676 } 3677 } 3678 } 3679 3680 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3681 Constant *Zero = Builder.getInt32(0); 3682 3683 // Get it's reduction variable descriptor. 3684 assert(Legal->isReductionVariable(Phi) && 3685 "Unable to find the reduction variable"); 3686 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi]; 3687 3688 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3689 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3690 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3691 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3692 RdxDesc.getMinMaxRecurrenceKind(); 3693 setDebugLocFromInst(Builder, ReductionStartValue); 3694 3695 // We need to generate a reduction vector from the incoming scalar. 3696 // To do so, we need to generate the 'identity' vector and override 3697 // one of the elements with the incoming scalar reduction. We need 3698 // to do it in the vector-loop preheader. 3699 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3700 3701 // This is the vector-clone of the value that leaves the loop. 3702 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType(); 3703 3704 // Find the reduction identity variable. Zero for addition, or, xor, 3705 // one for multiplication, -1 for And. 3706 Value *Identity; 3707 Value *VectorStart; 3708 if (RK == RecurrenceDescriptor::RK_IntegerMinMax || 3709 RK == RecurrenceDescriptor::RK_FloatMinMax) { 3710 // MinMax reduction have the start value as their identify. 3711 if (VF == 1) { 3712 VectorStart = Identity = ReductionStartValue; 3713 } else { 3714 VectorStart = Identity = 3715 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident"); 3716 } 3717 } else { 3718 // Handle other reduction kinds: 3719 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity( 3720 RK, VecTy->getScalarType()); 3721 if (VF == 1) { 3722 Identity = Iden; 3723 // This vector is the Identity vector where the first element is the 3724 // incoming scalar reduction. 3725 VectorStart = ReductionStartValue; 3726 } else { 3727 Identity = ConstantVector::getSplat(VF, Iden); 3728 3729 // This vector is the Identity vector where the first element is the 3730 // incoming scalar reduction. 3731 VectorStart = 3732 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero); 3733 } 3734 } 3735 3736 // Wrap flags are in general invalid after vectorization, clear them. 3737 clearReductionWrapFlags(RdxDesc); 3738 3739 // Fix the vector-loop phi. 3740 3741 // Reductions do not have to start at zero. They can start with 3742 // any loop invariant values. 3743 BasicBlock *Latch = OrigLoop->getLoopLatch(); 3744 Value *LoopVal = Phi->getIncomingValueForBlock(Latch); 3745 3746 for (unsigned Part = 0; Part < UF; ++Part) { 3747 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); 3748 Value *Val = getOrCreateVectorValue(LoopVal, Part); 3749 // Make sure to add the reduction start value only to the 3750 // first unroll part. 3751 Value *StartVal = (Part == 0) ? 
VectorStart : Identity; 3752 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); 3753 cast<PHINode>(VecRdxPhi) 3754 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3755 } 3756 3757 // Before each round, move the insertion point right between 3758 // the PHIs and the values we are going to write. 3759 // This allows us to write both PHINodes and the extractelement 3760 // instructions. 3761 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3762 3763 setDebugLocFromInst(Builder, LoopExitInst); 3764 3765 // If tail is folded by masking, the vector value to leave the loop should be 3766 // a Select choosing between the vectorized LoopExitInst and vectorized Phi, 3767 // instead of the former. 3768 if (Cost->foldTailByMasking()) { 3769 for (unsigned Part = 0; Part < UF; ++Part) { 3770 Value *VecLoopExitInst = 3771 VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3772 Value *Sel = nullptr; 3773 for (User *U : VecLoopExitInst->users()) { 3774 if (isa<SelectInst>(U)) { 3775 assert(!Sel && "Reduction exit feeding two selects"); 3776 Sel = U; 3777 } else 3778 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); 3779 } 3780 assert(Sel && "Reduction exit feeds no select"); 3781 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); 3782 } 3783 } 3784 3785 // If the vector reduction can be performed in a smaller type, we truncate 3786 // then extend the loop exit value to enable InstCombine to evaluate the 3787 // entire expression in the smaller type. 3788 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3789 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3790 Builder.SetInsertPoint( 3791 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3792 VectorParts RdxParts(UF); 3793 for (unsigned Part = 0; Part < UF; ++Part) { 3794 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3795 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3796 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3797 : Builder.CreateZExt(Trunc, VecTy); 3798 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3799 UI != RdxParts[Part]->user_end();) 3800 if (*UI != Trunc) { 3801 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3802 RdxParts[Part] = Extnd; 3803 } else { 3804 ++UI; 3805 } 3806 } 3807 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3808 for (unsigned Part = 0; Part < UF; ++Part) { 3809 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3810 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3811 } 3812 } 3813 3814 // Reduce all of the unrolled parts into a single vector. 3815 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3816 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3817 3818 // The middle block terminator has already been assigned a DebugLoc here (the 3819 // OrigLoop's single latch terminator). We want the whole middle block to 3820 // appear to execute on this line because: (a) it is all compiler generated, 3821 // (b) these instructions are always executed after evaluating the latch 3822 // conditional branch, and (c) other passes may add new predecessors which 3823 // terminate on this line. This is the easiest way to ensure we don't 3824 // accidentally cause an extra step back into the loop while debugging. 
3825 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3826 for (unsigned Part = 1; Part < UF; ++Part) { 3827 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3828 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3829 // Floating point operations had to be 'fast' to enable the reduction. 3830 ReducedPartRdx = addFastMathFlag( 3831 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3832 ReducedPartRdx, "bin.rdx"), 3833 RdxDesc.getFastMathFlags()); 3834 else 3835 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3836 RdxPart); 3837 } 3838 3839 if (VF > 1) { 3840 bool NoNaN = Legal->hasFunNoNaNAttr(); 3841 ReducedPartRdx = 3842 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3843 // If the reduction can be performed in a smaller type, we need to extend 3844 // the reduction to the wider type before we branch to the original loop. 3845 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3846 ReducedPartRdx = 3847 RdxDesc.isSigned() 3848 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3849 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3850 } 3851 3852 // Create a phi node that merges control-flow from the backedge-taken check 3853 // block and the middle block. 3854 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3855 LoopScalarPreHeader->getTerminator()); 3856 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3857 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3858 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3859 3860 // Now, we need to fix the users of the reduction variable 3861 // inside and outside of the scalar remainder loop. 3862 // We know that the loop is in LCSSA form. We need to update the 3863 // PHI nodes in the exit blocks. 3864 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3865 // All PHINodes need to have a single entry edge, or two if 3866 // we already fixed them. 3867 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3868 3869 // We found a reduction value exit-PHI. Update it with the 3870 // incoming bypass edge. 3871 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3872 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3873 } // end of the LCSSA phi scan. 3874 3875 // Fix the scalar loop reduction variable with the incoming reduction sum 3876 // from the vector body and from the backedge value. 3877 int IncomingEdgeBlockIdx = 3878 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3879 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3880 // Pick the other block. 3881 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 
0 : 1); 3882 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3883 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3884 } 3885 3886 void InnerLoopVectorizer::clearReductionWrapFlags( 3887 RecurrenceDescriptor &RdxDesc) { 3888 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3889 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3890 RK != RecurrenceDescriptor::RK_IntegerMult) 3891 return; 3892 3893 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3894 assert(LoopExitInstr && "null loop exit instruction"); 3895 SmallVector<Instruction *, 8> Worklist; 3896 SmallPtrSet<Instruction *, 8> Visited; 3897 Worklist.push_back(LoopExitInstr); 3898 Visited.insert(LoopExitInstr); 3899 3900 while (!Worklist.empty()) { 3901 Instruction *Cur = Worklist.pop_back_val(); 3902 if (isa<OverflowingBinaryOperator>(Cur)) 3903 for (unsigned Part = 0; Part < UF; ++Part) { 3904 Value *V = getOrCreateVectorValue(Cur, Part); 3905 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3906 } 3907 3908 for (User *U : Cur->users()) { 3909 Instruction *UI = cast<Instruction>(U); 3910 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3911 Visited.insert(UI).second) 3912 Worklist.push_back(UI); 3913 } 3914 } 3915 } 3916 3917 void InnerLoopVectorizer::fixLCSSAPHIs() { 3918 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3919 if (LCSSAPhi.getNumIncomingValues() == 1) { 3920 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3921 // Non-instruction incoming values will have only one value. 3922 unsigned LastLane = 0; 3923 if (isa<Instruction>(IncomingValue)) 3924 LastLane = Cost->isUniformAfterVectorization( 3925 cast<Instruction>(IncomingValue), VF) 3926 ? 0 3927 : VF - 1; 3928 // Can be a loop invariant incoming value or the last scalar value to be 3929 // extracted from the vectorized loop. 3930 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3931 Value *lastIncomingValue = 3932 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane }); 3933 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock); 3934 } 3935 } 3936 } 3937 3938 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { 3939 // The basic block and loop containing the predicated instruction. 3940 auto *PredBB = PredInst->getParent(); 3941 auto *VectorLoop = LI->getLoopFor(PredBB); 3942 3943 // Initialize a worklist with the operands of the predicated instruction. 3944 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end()); 3945 3946 // Holds instructions that we need to analyze again. An instruction may be 3947 // reanalyzed if we don't yet know if we can sink it or not. 3948 SmallVector<Instruction *, 8> InstsToReanalyze; 3949 3950 // Returns true if a given use occurs in the predicated block. Phi nodes use 3951 // their operands in their corresponding predecessor blocks. 3952 auto isBlockOfUsePredicated = [&](Use &U) -> bool { 3953 auto *I = cast<Instruction>(U.getUser()); 3954 BasicBlock *BB = I->getParent(); 3955 if (auto *Phi = dyn_cast<PHINode>(I)) 3956 BB = Phi->getIncomingBlock( 3957 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 3958 return BB == PredBB; 3959 }; 3960 3961 // Iteratively sink the scalarized operands of the predicated instruction 3962 // into the block we created for it. When an instruction is sunk, it's 3963 // operands are then added to the worklist. The algorithm ends after one pass 3964 // through the worklist doesn't sink a single instruction. 
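  // For instance (an illustrative sketch): if a store was predicated into its
  // own conditional block (typically named along the lines of "pred.store.if")
  // and the scalarized address computation feeding it has no other users, that
  // address computation can be moved into the predicated block so it only
  // executes when the corresponding mask lane is active.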
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}

void InnerLoopVectorizer::fixNonInductionPHIs() {
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    PHINode *NewPhi =
        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();

    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
        predecessors(OrigPhi->getParent()));
    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
        predecessors(NewPhi->getParent()));
    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
           "Scalar and Vector BB should have the same number of predecessors");

    // The insertion point in Builder may be invalidated by the time we get
    // here. Force the Builder insertion point to something valid so that we do
    // not run into issues during insertion point restore in
    // getOrCreateVectorValue calls below.
    Builder.SetInsertPoint(NewPhi);

    // The predecessor order is preserved and we can rely on the mapping
    // between scalar and vector block predecessors.
    for (unsigned i = 0; i < NumIncomingValues; ++i) {
      BasicBlock *NewPredBB = VectorBBPredecessors[i];

      // When looking up the new scalar/vector values to fix up, use the
      // incoming values from the original phi.
      Value *ScIncV =
          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);

      // The scalar incoming value may need a broadcast.
      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
      NewPhi->addIncoming(NewIncV, NewPredBB);
    }
  }
}

void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
                                   unsigned VF, bool IsPtrLoopInvariant,
                                   SmallBitVector &IsIndexLoopInvariant) {
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.
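  // As an illustrative sketch (IR shorthand, VF = 4, names are made up): for a
  // scalar GEP such as
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  // with a loop-invariant %base and a loop-varying %iv, only the index is
  // widened, yielding roughly
  //   %vgep = getelementptr inbounds i32, i32* %base, <4 x i64> %vec.iv
  // which produces a <4 x i32*> vector of pointers.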
4046 4047 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4048 // If we are vectorizing, but the GEP has only loop-invariant operands, 4049 // the GEP we build (by only using vector-typed operands for 4050 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4051 // produce a vector of pointers, we need to either arbitrarily pick an 4052 // operand to broadcast, or broadcast a clone of the original GEP. 4053 // Here, we broadcast a clone of the original. 4054 // 4055 // TODO: If at some point we decide to scalarize instructions having 4056 // loop-invariant operands, this special case will no longer be 4057 // required. We would add the scalarization decision to 4058 // collectLoopScalars() and teach getVectorValue() to broadcast 4059 // the lane-zero scalar value. 4060 auto *Clone = Builder.Insert(GEP->clone()); 4061 for (unsigned Part = 0; Part < UF; ++Part) { 4062 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4063 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4064 addMetadata(EntryPart, GEP); 4065 } 4066 } else { 4067 // If the GEP has at least one loop-varying operand, we are sure to 4068 // produce a vector of pointers. But if we are only unrolling, we want 4069 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4070 // produce with the code below will be scalar (if VF == 1) or vector 4071 // (otherwise). Note that for the unroll-only case, we still maintain 4072 // values in the vector mapping with initVector, as we do for other 4073 // instructions. 4074 for (unsigned Part = 0; Part < UF; ++Part) { 4075 // The pointer operand of the new GEP. If it's loop-invariant, we 4076 // won't broadcast it. 4077 auto *Ptr = IsPtrLoopInvariant 4078 ? GEP->getPointerOperand() 4079 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4080 4081 // Collect all the indices for the new GEP. If any index is 4082 // loop-invariant, we won't broadcast it. 4083 SmallVector<Value *, 4> Indices; 4084 for (auto Index : enumerate(GEP->indices())) { 4085 Value *User = Index.value().get(); 4086 if (IsIndexLoopInvariant[Index.index()]) 4087 Indices.push_back(User); 4088 else 4089 Indices.push_back(getOrCreateVectorValue(User, Part)); 4090 } 4091 4092 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4093 // but it should be a vector, otherwise. 4094 auto *NewGEP = 4095 GEP->isInBounds() 4096 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4097 Indices) 4098 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4099 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4100 "NewGEP is not a pointer vector"); 4101 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4102 addMetadata(NewGEP, GEP); 4103 } 4104 } 4105 } 4106 4107 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4108 unsigned VF) { 4109 PHINode *P = cast<PHINode>(PN); 4110 if (EnableVPlanNativePath) { 4111 // Currently we enter here in the VPlan-native path for non-induction 4112 // PHIs where all control flow is uniform. We simply widen these PHIs. 4113 // Create a vector phi with no operands - the vector phi operands will be 4114 // set at the end of vector code generation. 4115 Type *VecTy = 4116 (VF == 1) ? 
PN->getType() : VectorType::get(PN->getType(), VF); 4117 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4118 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4119 OrigPHIsToFix.push_back(P); 4120 4121 return; 4122 } 4123 4124 assert(PN->getParent() == OrigLoop->getHeader() && 4125 "Non-header phis should have been handled elsewhere"); 4126 4127 // In order to support recurrences we need to be able to vectorize Phi nodes. 4128 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4129 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4130 // this value when we vectorize all of the instructions that use the PHI. 4131 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4132 for (unsigned Part = 0; Part < UF; ++Part) { 4133 // This is phase one of vectorizing PHIs. 4134 Type *VecTy = 4135 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4136 Value *EntryPart = PHINode::Create( 4137 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4138 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4139 } 4140 return; 4141 } 4142 4143 setDebugLocFromInst(Builder, P); 4144 4145 // This PHINode must be an induction variable. 4146 // Make sure that we know about it. 4147 assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); 4148 4149 InductionDescriptor II = Legal->getInductionVars()->lookup(P); 4150 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4151 4152 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4153 // which can be found from the original scalar operations. 4154 switch (II.getKind()) { 4155 case InductionDescriptor::IK_NoInduction: 4156 llvm_unreachable("Unknown induction"); 4157 case InductionDescriptor::IK_IntInduction: 4158 case InductionDescriptor::IK_FpInduction: 4159 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4160 case InductionDescriptor::IK_PtrInduction: { 4161 // Handle the pointer induction variable case. 4162 assert(P->getType()->isPointerTy() && "Unexpected type."); 4163 // This is the normalized GEP that starts counting at zero. 4164 Value *PtrInd = Induction; 4165 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4166 // Determine the number of scalars we need to generate for each unroll 4167 // iteration. If the instruction is uniform, we only need to generate the 4168 // first lane. Otherwise, we generate all VF values. 4169 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4170 // These are the scalar results. Notice that we don't generate vector GEPs 4171 // because scalar GEPs result in better code. 4172 for (unsigned Part = 0; Part < UF; ++Part) { 4173 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4174 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4175 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4176 Value *SclrGep = 4177 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4178 SclrGep->setName("next.gep"); 4179 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4180 } 4181 } 4182 return; 4183 } 4184 } 4185 } 4186 4187 /// A helper function for checking whether an integer division-related 4188 /// instruction may divide by zero (in which case it must be predicated if 4189 /// executed conditionally in the scalar code). 4190 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 
4191 /// Non-zero divisors that are non compile-time constants will not be 4192 /// converted into multiplication, so we will still end up scalarizing 4193 /// the division, but can do so w/o predication. 4194 static bool mayDivideByZero(Instruction &I) { 4195 assert((I.getOpcode() == Instruction::UDiv || 4196 I.getOpcode() == Instruction::SDiv || 4197 I.getOpcode() == Instruction::URem || 4198 I.getOpcode() == Instruction::SRem) && 4199 "Unexpected instruction"); 4200 Value *Divisor = I.getOperand(1); 4201 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4202 return !CInt || CInt->isZero(); 4203 } 4204 4205 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4206 switch (I.getOpcode()) { 4207 case Instruction::Br: 4208 case Instruction::PHI: 4209 case Instruction::GetElementPtr: 4210 llvm_unreachable("This instruction is handled by a different recipe."); 4211 case Instruction::UDiv: 4212 case Instruction::SDiv: 4213 case Instruction::SRem: 4214 case Instruction::URem: 4215 case Instruction::Add: 4216 case Instruction::FAdd: 4217 case Instruction::Sub: 4218 case Instruction::FSub: 4219 case Instruction::FNeg: 4220 case Instruction::Mul: 4221 case Instruction::FMul: 4222 case Instruction::FDiv: 4223 case Instruction::FRem: 4224 case Instruction::Shl: 4225 case Instruction::LShr: 4226 case Instruction::AShr: 4227 case Instruction::And: 4228 case Instruction::Or: 4229 case Instruction::Xor: { 4230 // Just widen unops and binops. 4231 setDebugLocFromInst(Builder, &I); 4232 4233 for (unsigned Part = 0; Part < UF; ++Part) { 4234 SmallVector<Value *, 2> Ops; 4235 for (Value *Op : I.operands()) 4236 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4237 4238 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4239 4240 if (auto *VecOp = dyn_cast<Instruction>(V)) 4241 VecOp->copyIRFlags(&I); 4242 4243 // Use this vector value for all users of the original instruction. 4244 VectorLoopValueMap.setVectorValue(&I, Part, V); 4245 addMetadata(V, &I); 4246 } 4247 4248 break; 4249 } 4250 case Instruction::Select: { 4251 // Widen selects. 4252 // If the selector is loop invariant we can create a select 4253 // instruction with a scalar condition. Otherwise, use vector-select. 4254 auto *SE = PSE.getSE(); 4255 bool InvariantCond = 4256 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4257 setDebugLocFromInst(Builder, &I); 4258 4259 // The condition can be loop invariant but still defined inside the 4260 // loop. This means that we can't just use the original 'cond' value. 4261 // We have to take the 'vectorized' value and pick the first lane. 4262 // Instcombine will make this a no-op. 4263 4264 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4265 4266 for (unsigned Part = 0; Part < UF; ++Part) { 4267 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4268 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4269 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4270 Value *Sel = 4271 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4272 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4273 addMetadata(Sel, &I); 4274 } 4275 4276 break; 4277 } 4278 4279 case Instruction::ICmp: 4280 case Instruction::FCmp: { 4281 // Widen compares. Generate vector compares. 
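    // As an illustrative sketch (made-up value names), for VF = 4 a scalar
    // compare such as
    //   %c = icmp slt i32 %x, %y
    // is emitted as a compare of the widened operands and yields a vector of
    // i1 values:
    //   %c = icmp slt <4 x i32> %wide.x, %wide.y   ; type <4 x i1>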
4282 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4283 auto *Cmp = cast<CmpInst>(&I); 4284 setDebugLocFromInst(Builder, Cmp); 4285 for (unsigned Part = 0; Part < UF; ++Part) { 4286 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4287 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4288 Value *C = nullptr; 4289 if (FCmp) { 4290 // Propagate fast math flags. 4291 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4292 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4293 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4294 } else { 4295 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4296 } 4297 VectorLoopValueMap.setVectorValue(&I, Part, C); 4298 addMetadata(C, &I); 4299 } 4300 4301 break; 4302 } 4303 4304 case Instruction::ZExt: 4305 case Instruction::SExt: 4306 case Instruction::FPToUI: 4307 case Instruction::FPToSI: 4308 case Instruction::FPExt: 4309 case Instruction::PtrToInt: 4310 case Instruction::IntToPtr: 4311 case Instruction::SIToFP: 4312 case Instruction::UIToFP: 4313 case Instruction::Trunc: 4314 case Instruction::FPTrunc: 4315 case Instruction::BitCast: { 4316 auto *CI = cast<CastInst>(&I); 4317 setDebugLocFromInst(Builder, CI); 4318 4319 /// Vectorize casts. 4320 Type *DestTy = 4321 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4322 4323 for (unsigned Part = 0; Part < UF; ++Part) { 4324 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4325 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4326 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4327 addMetadata(Cast, &I); 4328 } 4329 break; 4330 } 4331 4332 case Instruction::Call: { 4333 // Ignore dbg intrinsics. 4334 if (isa<DbgInfoIntrinsic>(I)) 4335 break; 4336 setDebugLocFromInst(Builder, &I); 4337 4338 Module *M = I.getParent()->getParent()->getParent(); 4339 auto *CI = cast<CallInst>(&I); 4340 4341 StringRef FnName = CI->getCalledFunction()->getName(); 4342 Function *F = CI->getCalledFunction(); 4343 Type *RetTy = ToVectorTy(CI->getType(), VF); 4344 SmallVector<Type *, 4> Tys; 4345 for (Value *ArgOperand : CI->arg_operands()) 4346 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4347 4348 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4349 4350 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4351 // version of the instruction. 4352 // Is it beneficial to perform intrinsic call compared to lib call? 4353 bool NeedToScalarize; 4354 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4355 bool UseVectorIntrinsic = 4356 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4357 assert((UseVectorIntrinsic || !NeedToScalarize) && 4358 "Instruction should be scalarized elsewhere."); 4359 4360 for (unsigned Part = 0; Part < UF; ++Part) { 4361 SmallVector<Value *, 4> Args; 4362 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4363 Value *Arg = CI->getArgOperand(i); 4364 // Some intrinsics have a scalar argument - don't replace it with a 4365 // vector. 4366 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4367 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4368 Args.push_back(Arg); 4369 } 4370 4371 Function *VectorF; 4372 if (UseVectorIntrinsic) { 4373 // Use vector version of the intrinsic. 4374 Type *TysForDecl[] = {CI->getType()}; 4375 if (VF > 1) 4376 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4377 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4378 } else { 4379 // Use vector version of the library call. 
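        // For example (illustrative, not a guaranteed mapping): if the target
        // library info maps sinf at VF = 4 to a 4-lane vector variant,
        // TLI->getVectorizedFunction returns that variant's name below, and
        // the scalar call is re-emitted as a call to it with widened
        // arguments.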
4380 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); 4381 assert(!VFnName.empty() && "Vector function name is empty."); 4382 VectorF = M->getFunction(VFnName); 4383 if (!VectorF) { 4384 // Generate a declaration 4385 FunctionType *FTy = FunctionType::get(RetTy, Tys, false); 4386 VectorF = 4387 Function::Create(FTy, Function::ExternalLinkage, VFnName, M); 4388 VectorF->copyAttributesFrom(F); 4389 } 4390 } 4391 assert(VectorF && "Can't create vector function."); 4392 4393 SmallVector<OperandBundleDef, 1> OpBundles; 4394 CI->getOperandBundlesAsDefs(OpBundles); 4395 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4396 4397 if (isa<FPMathOperator>(V)) 4398 V->copyFastMathFlags(CI); 4399 4400 VectorLoopValueMap.setVectorValue(&I, Part, V); 4401 addMetadata(V, &I); 4402 } 4403 4404 break; 4405 } 4406 4407 default: 4408 // This instruction is not vectorized by simple widening. 4409 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4410 llvm_unreachable("Unhandled instruction!"); 4411 } // end of switch. 4412 } 4413 4414 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4415 // We should not collect Scalars more than once per VF. Right now, this 4416 // function is called from collectUniformsAndScalars(), which already does 4417 // this check. Collecting Scalars for VF=1 does not make any sense. 4418 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4419 "This function should not be visited twice for the same VF"); 4420 4421 SmallSetVector<Instruction *, 8> Worklist; 4422 4423 // These sets are used to seed the analysis with pointers used by memory 4424 // accesses that will remain scalar. 4425 SmallSetVector<Instruction *, 8> ScalarPtrs; 4426 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4427 4428 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4429 // The pointer operands of loads and stores will be scalar as long as the 4430 // memory access is not a gather or scatter operation. The value operand of a 4431 // store will remain scalar if the store is scalarized. 4432 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4433 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4434 assert(WideningDecision != CM_Unknown && 4435 "Widening decision should be ready at this moment"); 4436 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4437 if (Ptr == Store->getValueOperand()) 4438 return WideningDecision == CM_Scalarize; 4439 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4440 "Ptr is neither a value or pointer operand"); 4441 return WideningDecision != CM_GatherScatter; 4442 }; 4443 4444 // A helper that returns true if the given value is a bitcast or 4445 // getelementptr instruction contained in the loop. 4446 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4447 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4448 isa<GetElementPtrInst>(V)) && 4449 !TheLoop->isLoopInvariant(V); 4450 }; 4451 4452 // A helper that evaluates a memory access's use of a pointer. If the use 4453 // will be a scalar use, and the pointer is only used by memory accesses, we 4454 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4455 // PossibleNonScalarPtrs. 4456 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4457 // We only care about bitcast and getelementptr instructions contained in 4458 // the loop. 
4459 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4460 return; 4461 4462 // If the pointer has already been identified as scalar (e.g., if it was 4463 // also identified as uniform), there's nothing to do. 4464 auto *I = cast<Instruction>(Ptr); 4465 if (Worklist.count(I)) 4466 return; 4467 4468 // If the use of the pointer will be a scalar use, and all users of the 4469 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4470 // place the pointer in PossibleNonScalarPtrs. 4471 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4472 return isa<LoadInst>(U) || isa<StoreInst>(U); 4473 })) 4474 ScalarPtrs.insert(I); 4475 else 4476 PossibleNonScalarPtrs.insert(I); 4477 }; 4478 4479 // We seed the scalars analysis with three classes of instructions: (1) 4480 // instructions marked uniform-after-vectorization, (2) bitcast and 4481 // getelementptr instructions used by memory accesses requiring a scalar use, 4482 // and (3) pointer induction variables and their update instructions (we 4483 // currently only scalarize these). 4484 // 4485 // (1) Add to the worklist all instructions that have been identified as 4486 // uniform-after-vectorization. 4487 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4488 4489 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4490 // memory accesses requiring a scalar use. The pointer operands of loads and 4491 // stores will be scalar as long as the memory accesses is not a gather or 4492 // scatter operation. The value operand of a store will remain scalar if the 4493 // store is scalarized. 4494 for (auto *BB : TheLoop->blocks()) 4495 for (auto &I : *BB) { 4496 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4497 evaluatePtrUse(Load, Load->getPointerOperand()); 4498 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4499 evaluatePtrUse(Store, Store->getPointerOperand()); 4500 evaluatePtrUse(Store, Store->getValueOperand()); 4501 } 4502 } 4503 for (auto *I : ScalarPtrs) 4504 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4505 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4506 Worklist.insert(I); 4507 } 4508 4509 // (3) Add to the worklist all pointer induction variables and their update 4510 // instructions. 4511 // 4512 // TODO: Once we are able to vectorize pointer induction variables we should 4513 // no longer insert them into the worklist here. 4514 auto *Latch = TheLoop->getLoopLatch(); 4515 for (auto &Induction : *Legal->getInductionVars()) { 4516 auto *Ind = Induction.first; 4517 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4518 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4519 continue; 4520 Worklist.insert(Ind); 4521 Worklist.insert(IndUpdate); 4522 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4523 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4524 << "\n"); 4525 } 4526 4527 // Insert the forced scalars. 4528 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4529 // induction variable when the PHI user is scalarized. 4530 auto ForcedScalar = ForcedScalars.find(VF); 4531 if (ForcedScalar != ForcedScalars.end()) 4532 for (auto *I : ForcedScalar->second) 4533 Worklist.insert(I); 4534 4535 // Expand the worklist by looking through any bitcasts and getelementptr 4536 // instructions we've already identified as scalar. 
This is similar to the 4537 // expansion step in collectLoopUniforms(); however, here we're only 4538 // expanding to include additional bitcasts and getelementptr instructions. 4539 unsigned Idx = 0; 4540 while (Idx != Worklist.size()) { 4541 Instruction *Dst = Worklist[Idx++]; 4542 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4543 continue; 4544 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4545 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4546 auto *J = cast<Instruction>(U); 4547 return !TheLoop->contains(J) || Worklist.count(J) || 4548 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4549 isScalarUse(J, Src)); 4550 })) { 4551 Worklist.insert(Src); 4552 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4553 } 4554 } 4555 4556 // An induction variable will remain scalar if all users of the induction 4557 // variable and induction variable update remain scalar. 4558 for (auto &Induction : *Legal->getInductionVars()) { 4559 auto *Ind = Induction.first; 4560 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4561 4562 // We already considered pointer induction variables, so there's no reason 4563 // to look at their users again. 4564 // 4565 // TODO: Once we are able to vectorize pointer induction variables we 4566 // should no longer skip over them here. 4567 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4568 continue; 4569 4570 // Determine if all users of the induction variable are scalar after 4571 // vectorization. 4572 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4573 auto *I = cast<Instruction>(U); 4574 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4575 }); 4576 if (!ScalarInd) 4577 continue; 4578 4579 // Determine if all users of the induction variable update instruction are 4580 // scalar after vectorization. 4581 auto ScalarIndUpdate = 4582 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4583 auto *I = cast<Instruction>(U); 4584 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4585 }); 4586 if (!ScalarIndUpdate) 4587 continue; 4588 4589 // The induction variable and its update instruction will remain scalar. 4590 Worklist.insert(Ind); 4591 Worklist.insert(IndUpdate); 4592 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4593 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4594 << "\n"); 4595 } 4596 4597 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4598 } 4599 4600 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4601 if (!blockNeedsPredication(I->getParent())) 4602 return false; 4603 switch(I->getOpcode()) { 4604 default: 4605 break; 4606 case Instruction::Load: 4607 case Instruction::Store: { 4608 if (!Legal->isMaskRequired(I)) 4609 return false; 4610 auto *Ptr = getLoadStorePointerOperand(I); 4611 auto *Ty = getMemInstValueType(I); 4612 // We have already decided how to vectorize this instruction, get that 4613 // result. 4614 if (VF > 1) { 4615 InstWidening WideningDecision = getWideningDecision(I, VF); 4616 assert(WideningDecision != CM_Unknown && 4617 "Widening decision should be ready at this moment"); 4618 return WideningDecision == CM_Scalarize; 4619 } 4620 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4621 return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4622 isLegalMaskedGather(Ty, Alignment)) 4623 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4624 isLegalMaskedScatter(Ty, Alignment)); 4625 } 4626 case Instruction::UDiv: 4627 case Instruction::SDiv: 4628 case Instruction::SRem: 4629 case Instruction::URem: 4630 return mayDivideByZero(*I); 4631 } 4632 return false; 4633 } 4634 4635 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4636 unsigned VF) { 4637 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4638 assert(getWideningDecision(I, VF) == CM_Unknown && 4639 "Decision should not be set yet."); 4640 auto *Group = getInterleavedAccessGroup(I); 4641 assert(Group && "Must have a group."); 4642 4643 // If the instruction's allocated size doesn't equal it's type size, it 4644 // requires padding and will be scalarized. 4645 auto &DL = I->getModule()->getDataLayout(); 4646 auto *ScalarTy = getMemInstValueType(I); 4647 if (hasIrregularType(ScalarTy, DL, VF)) 4648 return false; 4649 4650 // Check if masking is required. 4651 // A Group may need masking for one of two reasons: it resides in a block that 4652 // needs predication, or it was decided to use masking to deal with gaps. 4653 bool PredicatedAccessRequiresMasking = 4654 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4655 bool AccessWithGapsRequiresMasking = 4656 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4657 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4658 return true; 4659 4660 // If masked interleaving is required, we expect that the user/target had 4661 // enabled it, because otherwise it either wouldn't have been created or 4662 // it should have been invalidated by the CostModel. 4663 assert(useMaskedInterleavedAccesses(TTI) && 4664 "Masked interleave-groups for predicated accesses are not enabled."); 4665 4666 auto *Ty = getMemInstValueType(I); 4667 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4668 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4669 : TTI.isLegalMaskedStore(Ty, Alignment); 4670 } 4671 4672 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4673 unsigned VF) { 4674 // Get and ensure we have a valid memory instruction. 4675 LoadInst *LI = dyn_cast<LoadInst>(I); 4676 StoreInst *SI = dyn_cast<StoreInst>(I); 4677 assert((LI || SI) && "Invalid memory instruction"); 4678 4679 auto *Ptr = getLoadStorePointerOperand(I); 4680 4681 // In order to be widened, the pointer should be consecutive, first of all. 4682 if (!Legal->isConsecutivePtr(Ptr)) 4683 return false; 4684 4685 // If the instruction is a store located in a predicated block, it will be 4686 // scalarized. 4687 if (isScalarWithPredication(I)) 4688 return false; 4689 4690 // If the instruction's allocated size doesn't equal it's type size, it 4691 // requires padding and will be scalarized. 4692 auto &DL = I->getModule()->getDataLayout(); 4693 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4694 if (hasIrregularType(ScalarTy, DL, VF)) 4695 return false; 4696 4697 return true; 4698 } 4699 4700 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4701 // We should not collect Uniforms more than once per VF. Right now, 4702 // this function is called from collectUniformsAndScalars(), which 4703 // already does this check. Collecting Uniforms for VF=1 does not make any 4704 // sense. 
4705 4706 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4707 "This function should not be visited twice for the same VF"); 4708 4709 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4710 // not analyze again. Uniforms.count(VF) will return 1. 4711 Uniforms[VF].clear(); 4712 4713 // We now know that the loop is vectorizable! 4714 // Collect instructions inside the loop that will remain uniform after 4715 // vectorization. 4716 4717 // Global values, params and instructions outside of current loop are out of 4718 // scope. 4719 auto isOutOfScope = [&](Value *V) -> bool { 4720 Instruction *I = dyn_cast<Instruction>(V); 4721 return (!I || !TheLoop->contains(I)); 4722 }; 4723 4724 SetVector<Instruction *> Worklist; 4725 BasicBlock *Latch = TheLoop->getLoopLatch(); 4726 4727 // Instructions that are scalar with predication must not be considered 4728 // uniform after vectorization, because that would create an erroneous 4729 // replicating region where only a single instance out of VF should be formed. 4730 // TODO: optimize such seldom cases if found important, see PR40816. 4731 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4732 if (isScalarWithPredication(I, VF)) { 4733 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4734 << *I << "\n"); 4735 return; 4736 } 4737 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4738 Worklist.insert(I); 4739 }; 4740 4741 // Start with the conditional branch. If the branch condition is an 4742 // instruction contained in the loop that is only used by the branch, it is 4743 // uniform. 4744 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4745 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4746 addToWorklistIfAllowed(Cmp); 4747 4748 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4749 // are pointers that are treated like consecutive pointers during 4750 // vectorization. The pointer operands of interleaved accesses are an 4751 // example. 4752 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4753 4754 // Holds pointer operands of instructions that are possibly non-uniform. 4755 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4756 4757 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4758 InstWidening WideningDecision = getWideningDecision(I, VF); 4759 assert(WideningDecision != CM_Unknown && 4760 "Widening decision should be ready at this moment"); 4761 4762 return (WideningDecision == CM_Widen || 4763 WideningDecision == CM_Widen_Reverse || 4764 WideningDecision == CM_Interleave); 4765 }; 4766 // Iterate over the instructions in the loop, and collect all 4767 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4768 // that a consecutive-like pointer operand will be scalarized, we collect it 4769 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4770 // getelementptr instruction can be used by both vectorized and scalarized 4771 // memory instructions. For example, if a loop loads and stores from the same 4772 // location, but the store is conditional, the store will be scalarized, and 4773 // the getelementptr won't remain uniform. 4774 for (auto *BB : TheLoop->blocks()) 4775 for (auto &I : *BB) { 4776 // If there's no pointer operand, there's nothing to do. 
4777 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4778 if (!Ptr) 4779 continue; 4780 4781 // True if all users of Ptr are memory accesses that have Ptr as their 4782 // pointer operand. 4783 auto UsersAreMemAccesses = 4784 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4785 return getLoadStorePointerOperand(U) == Ptr; 4786 }); 4787 4788 // Ensure the memory instruction will not be scalarized or used by 4789 // gather/scatter, making its pointer operand non-uniform. If the pointer 4790 // operand is used by any instruction other than a memory access, we 4791 // conservatively assume the pointer operand may be non-uniform. 4792 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4793 PossibleNonUniformPtrs.insert(Ptr); 4794 4795 // If the memory instruction will be vectorized and its pointer operand 4796 // is consecutive-like, or interleaving - the pointer operand should 4797 // remain uniform. 4798 else 4799 ConsecutiveLikePtrs.insert(Ptr); 4800 } 4801 4802 // Add to the Worklist all consecutive and consecutive-like pointers that 4803 // aren't also identified as possibly non-uniform. 4804 for (auto *V : ConsecutiveLikePtrs) 4805 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4806 addToWorklistIfAllowed(V); 4807 4808 // Expand Worklist in topological order: whenever a new instruction 4809 // is added , its users should be already inside Worklist. It ensures 4810 // a uniform instruction will only be used by uniform instructions. 4811 unsigned idx = 0; 4812 while (idx != Worklist.size()) { 4813 Instruction *I = Worklist[idx++]; 4814 4815 for (auto OV : I->operand_values()) { 4816 // isOutOfScope operands cannot be uniform instructions. 4817 if (isOutOfScope(OV)) 4818 continue; 4819 // First order recurrence Phi's should typically be considered 4820 // non-uniform. 4821 auto *OP = dyn_cast<PHINode>(OV); 4822 if (OP && Legal->isFirstOrderRecurrence(OP)) 4823 continue; 4824 // If all the users of the operand are uniform, then add the 4825 // operand into the uniform worklist. 4826 auto *OI = cast<Instruction>(OV); 4827 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4828 auto *J = cast<Instruction>(U); 4829 return Worklist.count(J) || 4830 (OI == getLoadStorePointerOperand(J) && 4831 isUniformDecision(J, VF)); 4832 })) 4833 addToWorklistIfAllowed(OI); 4834 } 4835 } 4836 4837 // Returns true if Ptr is the pointer operand of a memory access instruction 4838 // I, and I is known to not require scalarization. 4839 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4840 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4841 }; 4842 4843 // For an instruction to be added into Worklist above, all its users inside 4844 // the loop should also be in Worklist. However, this condition cannot be 4845 // true for phi nodes that form a cyclic dependence. We must process phi 4846 // nodes separately. An induction variable will remain uniform if all users 4847 // of the induction variable and induction variable update remain uniform. 4848 // The code below handles both pointer and non-pointer induction variables. 4849 for (auto &Induction : *Legal->getInductionVars()) { 4850 auto *Ind = Induction.first; 4851 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4852 4853 // Determine if all users of the induction variable are uniform after 4854 // vectorization. 
4855 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4856 auto *I = cast<Instruction>(U); 4857 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4858 isVectorizedMemAccessUse(I, Ind); 4859 }); 4860 if (!UniformInd) 4861 continue; 4862 4863 // Determine if all users of the induction variable update instruction are 4864 // uniform after vectorization. 4865 auto UniformIndUpdate = 4866 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4867 auto *I = cast<Instruction>(U); 4868 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4869 isVectorizedMemAccessUse(I, IndUpdate); 4870 }); 4871 if (!UniformIndUpdate) 4872 continue; 4873 4874 // The induction variable and its update instruction will remain uniform. 4875 addToWorklistIfAllowed(Ind); 4876 addToWorklistIfAllowed(IndUpdate); 4877 } 4878 4879 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4880 } 4881 4882 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4883 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4884 4885 if (Legal->getRuntimePointerChecking()->Need) { 4886 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4887 "runtime pointer checks needed. Enable vectorization of this " 4888 "loop with '#pragma clang loop vectorize(enable)' when " 4889 "compiling with -Os/-Oz", 4890 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4891 return true; 4892 } 4893 4894 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4895 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4896 "runtime SCEV checks needed. Enable vectorization of this " 4897 "loop with '#pragma clang loop vectorize(enable)' when " 4898 "compiling with -Os/-Oz", 4899 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4900 return true; 4901 } 4902 4903 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4904 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4905 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4906 "runtime stride == 1 checks needed. Enable vectorization of " 4907 "this loop with '#pragma clang loop vectorize(enable)' when " 4908 "compiling with -Os/-Oz", 4909 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4910 return true; 4911 } 4912 4913 return false; 4914 } 4915 4916 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4917 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4918 // TODO: It may by useful to do since it's still likely to be dynamically 4919 // uniform if the target can skip. 4920 reportVectorizationFailure( 4921 "Not inserting runtime ptr check for divergent target", 4922 "runtime pointer checks needed. 
Not enabled for divergent target", 4923 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4924 return None; 4925 } 4926 4927 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4928 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4929 if (TC == 1) { 4930 reportVectorizationFailure("Single iteration (non) loop", 4931 "loop trip count is one, irrelevant for vectorization", 4932 "SingleIterationLoop", ORE, TheLoop); 4933 return None; 4934 } 4935 4936 switch (ScalarEpilogueStatus) { 4937 case CM_ScalarEpilogueAllowed: 4938 return computeFeasibleMaxVF(TC); 4939 case CM_ScalarEpilogueNotNeededUsePredicate: 4940 LLVM_DEBUG( 4941 dbgs() << "LV: vector predicate hint/switch found.\n" 4942 << "LV: Not allowing scalar epilogue, creating predicated " 4943 << "vector loop.\n"); 4944 break; 4945 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4946 // fallthrough as a special case of OptForSize 4947 case CM_ScalarEpilogueNotAllowedOptSize: 4948 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4949 LLVM_DEBUG( 4950 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4951 else 4952 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4953 << "count.\n"); 4954 4955 // Bail if runtime checks are required, which are not good when optimising 4956 // for size. 4957 if (runtimeChecksRequired()) 4958 return None; 4959 break; 4960 } 4961 4962 // Now try the tail folding 4963 4964 // Invalidate interleave groups that require an epilogue if we can't mask 4965 // the interleave-group. 4966 if (!useMaskedInterleavedAccesses(TTI)) 4967 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4968 4969 unsigned MaxVF = computeFeasibleMaxVF(TC); 4970 if (TC > 0 && TC % MaxVF == 0) { 4971 // Accept MaxVF if we do not have a tail. 4972 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4973 return MaxVF; 4974 } 4975 4976 // If we don't know the precise trip count, or if the trip count that we 4977 // found modulo the vectorization factor is not zero, try to fold the tail 4978 // by masking. 4979 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4980 if (Legal->prepareToFoldTailByMasking()) { 4981 FoldTailByMasking = true; 4982 return MaxVF; 4983 } 4984 4985 if (TC == 0) { 4986 reportVectorizationFailure( 4987 "Unable to calculate the loop count due to complex control flow", 4988 "unable to calculate the loop count due to complex control flow", 4989 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4990 return None; 4991 } 4992 4993 reportVectorizationFailure( 4994 "Cannot optimize for size and vectorize at the same time.", 4995 "cannot optimize for size and vectorize at the same time. " 4996 "Enable vectorization of this loop with '#pragma clang loop " 4997 "vectorize(enable)' when compiling with -Os/-Oz", 4998 "NoTailLoopWithOptForSize", ORE, TheLoop); 4999 return None; 5000 } 5001 5002 unsigned 5003 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5004 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5005 unsigned SmallestType, WidestType; 5006 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5007 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5008 5009 // Get the maximum safe dependence distance in bits computed by LAA. 5010 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5011 // the memory accesses that is most restrictive (involved in the smallest 5012 // dependence distance). 
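  // For example (illustrative numbers): if the most restrictive dependence
  // only permits MaxVF = 4 for 4-byte (i32) accesses, this width is
  // 4 * 4 * 8 = 128 bits, and a wider target register is clamped down to it
  // below.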
5013 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5014 5015 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5016 5017 unsigned MaxVectorSize = WidestRegister / WidestType; 5018 5019 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5020 << " / " << WidestType << " bits.\n"); 5021 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5022 << WidestRegister << " bits.\n"); 5023 5024 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5025 " into one vector!"); 5026 if (MaxVectorSize == 0) { 5027 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5028 MaxVectorSize = 1; 5029 return MaxVectorSize; 5030 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5031 isPowerOf2_32(ConstTripCount)) { 5032 // We need to clamp the VF to be the ConstTripCount. There is no point in 5033 // choosing a higher viable VF as done in the loop below. 5034 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5035 << ConstTripCount << "\n"); 5036 MaxVectorSize = ConstTripCount; 5037 return MaxVectorSize; 5038 } 5039 5040 unsigned MaxVF = MaxVectorSize; 5041 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5042 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5043 // Collect all viable vectorization factors larger than the default MaxVF 5044 // (i.e. MaxVectorSize). 5045 SmallVector<unsigned, 8> VFs; 5046 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5047 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5048 VFs.push_back(VS); 5049 5050 // For each VF calculate its register usage. 5051 auto RUs = calculateRegisterUsage(VFs); 5052 5053 // Select the largest VF which doesn't require more registers than existing 5054 // ones. 5055 for (int i = RUs.size() - 1; i >= 0; --i) { 5056 bool Selected = true; 5057 for (auto& pair : RUs[i].MaxLocalUsers) { 5058 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5059 if (pair.second > TargetNumRegisters) 5060 Selected = false; 5061 } 5062 if (Selected) { 5063 MaxVF = VFs[i]; 5064 break; 5065 } 5066 } 5067 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5068 if (MaxVF < MinVF) { 5069 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5070 << ") with target's minimum: " << MinVF << '\n'); 5071 MaxVF = MinVF; 5072 } 5073 } 5074 } 5075 return MaxVF; 5076 } 5077 5078 VectorizationFactor 5079 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5080 float Cost = expectedCost(1).first; 5081 const float ScalarCost = Cost; 5082 unsigned Width = 1; 5083 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5084 5085 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5086 if (ForceVectorization && MaxVF > 1) { 5087 // Ignore scalar width, because the user explicitly wants vectorization. 5088 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5089 // evaluation. 5090 Cost = std::numeric_limits<float>::max(); 5091 } 5092 5093 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5094 // Notice that the vector loop needs to be executed less times, so 5095 // we need to divide the cost of the vector loops by the width of 5096 // the vector elements. 
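    // For example (hypothetical costs): if expectedCost(4).first were 20 while
    // the scalar loop costs 8, the per-lane cost would be 20 / 4 = 5 < 8, so
    // VF = 4 would be preferred over the scalar loop at this point.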
5097 VectorizationCostTy C = expectedCost(i); 5098 float VectorCost = C.first / (float)i; 5099 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5100 << " costs: " << (int)VectorCost << ".\n"); 5101 if (!C.second && !ForceVectorization) { 5102 LLVM_DEBUG( 5103 dbgs() << "LV: Not considering vector loop of width " << i 5104 << " because it will not generate any vector instructions.\n"); 5105 continue; 5106 } 5107 if (VectorCost < Cost) { 5108 Cost = VectorCost; 5109 Width = i; 5110 } 5111 } 5112 5113 if (!EnableCondStoresVectorization && NumPredStores) { 5114 reportVectorizationFailure("There are conditional stores.", 5115 "store that is conditionally executed prevents vectorization", 5116 "ConditionalStore", ORE, TheLoop); 5117 Width = 1; 5118 Cost = ScalarCost; 5119 } 5120 5121 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5122 << "LV: Vectorization seems to be not beneficial, " 5123 << "but was forced by a user.\n"); 5124 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5125 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5126 return Factor; 5127 } 5128 5129 std::pair<unsigned, unsigned> 5130 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5131 unsigned MinWidth = -1U; 5132 unsigned MaxWidth = 8; 5133 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5134 5135 // For each block. 5136 for (BasicBlock *BB : TheLoop->blocks()) { 5137 // For each instruction in the loop. 5138 for (Instruction &I : BB->instructionsWithoutDebug()) { 5139 Type *T = I.getType(); 5140 5141 // Skip ignored values. 5142 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5143 continue; 5144 5145 // Only examine Loads, Stores and PHINodes. 5146 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5147 continue; 5148 5149 // Examine PHI nodes that are reduction variables. Update the type to 5150 // account for the recurrence type. 5151 if (auto *PN = dyn_cast<PHINode>(&I)) { 5152 if (!Legal->isReductionVariable(PN)) 5153 continue; 5154 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; 5155 T = RdxDesc.getRecurrenceType(); 5156 } 5157 5158 // Examine the stored values. 5159 if (auto *ST = dyn_cast<StoreInst>(&I)) 5160 T = ST->getValueOperand()->getType(); 5161 5162 // Ignore loaded pointer types and stored pointer types that are not 5163 // vectorizable. 5164 // 5165 // FIXME: The check here attempts to predict whether a load or store will 5166 // be vectorized. We only know this for certain after a VF has 5167 // been selected. Here, we assume that if an access can be 5168 // vectorized, it will be. We should also look at extending this 5169 // optimization to non-pointer types. 5170 // 5171 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5172 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5173 continue; 5174 5175 MinWidth = std::min(MinWidth, 5176 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5177 MaxWidth = std::max(MaxWidth, 5178 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5179 } 5180 } 5181 5182 return {MinWidth, MaxWidth}; 5183 } 5184 5185 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5186 unsigned LoopCost) { 5187 // -- The interleave heuristics -- 5188 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5189 // There are many micro-architectural considerations that we can't predict 5190 // at this level. 
For example, frontend pressure (on decode or fetch) due to 5191 // code size, or the number and capabilities of the execution ports. 5192 // 5193 // We use the following heuristics to select the interleave count: 5194 // 1. If the code has reductions, then we interleave to break the cross 5195 // iteration dependency. 5196 // 2. If the loop is really small, then we interleave to reduce the loop 5197 // overhead. 5198 // 3. We don't interleave if we think that we will spill registers to memory 5199 // due to the increased register pressure. 5200 5201 if (!isScalarEpilogueAllowed()) 5202 return 1; 5203 5204 // We used the distance for the interleave count. 5205 if (Legal->getMaxSafeDepDistBytes() != -1U) 5206 return 1; 5207 5208 // Do not interleave loops with a relatively small known or estimated trip 5209 // count. 5210 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5211 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5212 return 1; 5213 5214 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5215 // We divide by these constants so assume that we have at least one 5216 // instruction that uses at least one register. 5217 for (auto& pair : R.MaxLocalUsers) { 5218 pair.second = std::max(pair.second, 1U); 5219 } 5220 5221 // We calculate the interleave count using the following formula. 5222 // Subtract the number of loop invariants from the number of available 5223 // registers. These registers are used by all of the interleaved instances. 5224 // Next, divide the remaining registers by the number of registers that is 5225 // required by the loop, in order to estimate how many parallel instances 5226 // fit without causing spills. All of this is rounded down if necessary to be 5227 // a power of two. We want power of two interleave count to simplify any 5228 // addressing operations or alignment considerations. 5229 // We also want power of two interleave counts to ensure that the induction 5230 // variable of the vector loop wraps to zero, when tail is folded by masking; 5231 // this currently happens when OptForSize, in which case IC is set to 1 above. 5232 unsigned IC = UINT_MAX; 5233 5234 for (auto& pair : R.MaxLocalUsers) { 5235 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5236 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5237 << " registers of " 5238 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5239 if (VF == 1) { 5240 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5241 TargetNumRegisters = ForceTargetNumScalarRegs; 5242 } else { 5243 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5244 TargetNumRegisters = ForceTargetNumVectorRegs; 5245 } 5246 unsigned MaxLocalUsers = pair.second; 5247 unsigned LoopInvariantRegs = 0; 5248 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5249 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5250 5251 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5252 // Don't count the induction variable as interleaved. 5253 if (EnableIndVarRegisterHeur) { 5254 TmpIC = 5255 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5256 std::max(1U, (MaxLocalUsers - 1))); 5257 } 5258 5259 IC = std::min(IC, TmpIC); 5260 } 5261 5262 // Clamp the interleave ranges to reasonable counts. 5263 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5264 5265 // Check if the user has overridden the max. 
  if (VF == 1) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to be less than the trip count divided by VF.
  if (BestKnownTC) {
    MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
  }

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF).first;

  assert(LoopCost && "Non-zero loop cost expected");

  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else if (IC < 1)
    IC = 1;

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF > 1 && !Legal->getReductionVars()->empty()) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF == 1 && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by the max interleave
    // count) are saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit this, by default, to 2
    // so the critical path only gets increased by one reduction operation.
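    // For example (illustrative): with the default limit of 2, an inner loop
    // containing a scalar reduction is interleaved at most 2x here, even if
    // SmallIC, StoresIC or LoadsIC would otherwise allow more.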
    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = !Legal->getReductionVars()->empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multi-map that holds the list of
  // intervals that *end* at a specific location. This multi-map allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multi-map then we remove it from the set. The max register
  // usage is the maximum size of the set. We also search for instructions that
  // are defined outside the loop, but are used inside the loop. We need this
  // number separately from the max-interval usage number because, when we
  // unroll, loop-invariant values do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instructions that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
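      // For example (illustrative): if the instruction at index 2 is used by
      // the instructions at indices 5 and 9, its recorded end point is
      // overwritten at each use, so the last use wins and the value is treated
      // as live from its definition through that final use.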
5401 for (Value *U : I.operands()) { 5402 auto *Instr = dyn_cast<Instruction>(U); 5403 5404 // Ignore non-instruction values such as arguments, constants, etc. 5405 if (!Instr) 5406 continue; 5407 5408 // If this instruction is outside the loop then record it and continue. 5409 if (!TheLoop->contains(Instr)) { 5410 LoopInvariants.insert(Instr); 5411 continue; 5412 } 5413 5414 // Overwrite previous end points. 5415 EndPoint[Instr] = IdxToInstr.size(); 5416 Ends.insert(Instr); 5417 } 5418 } 5419 } 5420 5421 // Saves the list of intervals that end with the index in 'key'. 5422 using InstrList = SmallVector<Instruction *, 2>; 5423 DenseMap<unsigned, InstrList> TransposeEnds; 5424 5425 // Transpose the EndPoints to a list of values that end at each index. 5426 for (auto &Interval : EndPoint) 5427 TransposeEnds[Interval.second].push_back(Interval.first); 5428 5429 SmallPtrSet<Instruction *, 8> OpenIntervals; 5430 5431 // Get the size of the widest register. 5432 unsigned MaxSafeDepDist = -1U; 5433 if (Legal->getMaxSafeDepDistBytes() != -1U) 5434 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5435 unsigned WidestRegister = 5436 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5437 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5438 5439 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5440 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5441 5442 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5443 5444 // A lambda that gets the register usage for the given type and VF. 5445 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5446 if (Ty->isTokenTy()) 5447 return 0U; 5448 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5449 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5450 }; 5451 5452 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5453 Instruction *I = IdxToInstr[i]; 5454 5455 // Remove all of the instructions that end at this location. 5456 InstrList &List = TransposeEnds[i]; 5457 for (Instruction *ToRemove : List) 5458 OpenIntervals.erase(ToRemove); 5459 5460 // Ignore instructions that are never used within the loop. 5461 if (Ends.find(I) == Ends.end()) 5462 continue; 5463 5464 // Skip ignored values. 5465 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5466 continue; 5467 5468 // For each VF find the maximum usage of registers. 5469 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5470 // Count the number of live intervals. 5471 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5472 5473 if (VFs[j] == 1) { 5474 for (auto Inst : OpenIntervals) { 5475 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5476 if (RegUsage.find(ClassID) == RegUsage.end()) 5477 RegUsage[ClassID] = 1; 5478 else 5479 RegUsage[ClassID] += 1; 5480 } 5481 } else { 5482 collectUniformsAndScalars(VFs[j]); 5483 for (auto Inst : OpenIntervals) { 5484 // Skip ignored values for VF > 1. 
5485 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5486 continue; 5487 if (isScalarAfterVectorization(Inst, VFs[j])) { 5488 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5489 if (RegUsage.find(ClassID) == RegUsage.end()) 5490 RegUsage[ClassID] = 1; 5491 else 5492 RegUsage[ClassID] += 1; 5493 } else { 5494 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5495 if (RegUsage.find(ClassID) == RegUsage.end()) 5496 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5497 else 5498 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5499 } 5500 } 5501 } 5502 5503 for (auto& pair : RegUsage) { 5504 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5505 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5506 else 5507 MaxUsages[j][pair.first] = pair.second; 5508 } 5509 } 5510 5511 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5512 << OpenIntervals.size() << '\n'); 5513 5514 // Add the current instruction to the list of open intervals. 5515 OpenIntervals.insert(I); 5516 } 5517 5518 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5519 SmallMapVector<unsigned, unsigned, 4> Invariant; 5520 5521 for (auto Inst : LoopInvariants) { 5522 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5523 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5524 if (Invariant.find(ClassID) == Invariant.end()) 5525 Invariant[ClassID] = Usage; 5526 else 5527 Invariant[ClassID] += Usage; 5528 } 5529 5530 LLVM_DEBUG({ 5531 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5532 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5533 << " item\n"; 5534 for (const auto &pair : MaxUsages[i]) { 5535 dbgs() << "LV(REG): RegisterClass: " 5536 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5537 << " registers\n"; 5538 } 5539 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5540 << " item\n"; 5541 for (const auto &pair : Invariant) { 5542 dbgs() << "LV(REG): RegisterClass: " 5543 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5544 << " registers\n"; 5545 } 5546 }); 5547 5548 RU.LoopInvariantRegs = Invariant; 5549 RU.MaxLocalUsers = MaxUsages[i]; 5550 RUs[i] = RU; 5551 } 5552 5553 return RUs; 5554 } 5555 5556 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5557 // TODO: Cost model for emulated masked load/store is completely 5558 // broken. This hack guides the cost model to use an artificially 5559 // high enough value to practically disable vectorization with such 5560 // operations, except where previously deployed legality hack allowed 5561 // using very low cost values. This is to avoid regressions coming simply 5562 // from moving "masked load/store" check from legality to cost model. 5563 // Masked Load/Gather emulation was previously never allowed. 5564 // Limited number of Masked Store/Scatter emulation was allowed. 5565 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5566 return isa<LoadInst>(I) || 5567 (isa<StoreInst>(I) && 5568 NumPredStores > NumberOfStoresToPredicate); 5569 } 5570 5571 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5572 // If we aren't vectorizing the loop, or if we've already collected the 5573 // instructions to scalarize, there's nothing to do. Collection may already 5574 // have occurred if we have a user-selected VF and are now computing the 5575 // expected cost for interleaving. 
5576 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5577 return;
5578
5579 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5580 // not profitable to scalarize any instructions, the presence of VF in the
5581 // map will indicate that we've analyzed it already.
5582 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5583
5584 // Find all the instructions that are scalar with predication in the loop and
5585 // determine whether it would be better not to if-convert the blocks they are in.
5586 // If so, we also record the instructions to scalarize.
5587 for (BasicBlock *BB : TheLoop->blocks()) {
5588 if (!blockNeedsPredication(BB))
5589 continue;
5590 for (Instruction &I : *BB)
5591 if (isScalarWithPredication(&I)) {
5592 ScalarCostsTy ScalarCosts;
5593 // Do not apply discount logic if the hacked cost is needed
5594 // for emulated masked memrefs.
5595 if (!useEmulatedMaskMemRefHack(&I) &&
5596 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5597 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5598 // Remember that BB will remain after vectorization.
5599 PredicatedBBsAfterVectorization.insert(BB);
5600 }
5601 }
5602 }
5603
5604 int LoopVectorizationCostModel::computePredInstDiscount(
5605 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5606 unsigned VF) {
5607 assert(!isUniformAfterVectorization(PredInst, VF) &&
5608 "Instruction marked uniform-after-vectorization will be predicated");
5609
5610 // Initialize the discount to zero, meaning that the scalar version and the
5611 // vector version cost the same.
5612 int Discount = 0;
5613
5614 // Holds instructions to analyze. The instructions we visit are mapped in
5615 // ScalarCosts. Those instructions are the ones that would be scalarized if
5616 // we find that the scalar version costs less.
5617 SmallVector<Instruction *, 8> Worklist;
5618
5619 // Returns true if the given instruction can be scalarized.
5620 auto canBeScalarized = [&](Instruction *I) -> bool {
5621 // We only attempt to scalarize instructions forming a single-use chain
5622 // from the original predicated block that would otherwise be vectorized.
5623 // Although not strictly necessary, we give up on instructions we know will
5624 // already be scalar to avoid traversing chains that are unlikely to be
5625 // beneficial.
5626 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5627 isScalarAfterVectorization(I, VF))
5628 return false;
5629
5630 // If the instruction is scalar with predication, it will be analyzed
5631 // separately. We ignore it within the context of PredInst.
5632 if (isScalarWithPredication(I))
5633 return false;
5634
5635 // If any of the instruction's operands are uniform after vectorization,
5636 // the instruction cannot be scalarized. This prevents, for example, a
5637 // masked load from being scalarized.
5638 //
5639 // We assume we will only emit a value for lane zero of an instruction
5640 // marked uniform after vectorization, rather than VF identical values.
5641 // Thus, if we scalarize an instruction that uses a uniform, we would
5642 // create uses of values corresponding to the lanes we aren't emitting code
5643 // for. This behavior can be changed by allowing getScalarValue to clone
5644 // the lane zero values for uniforms rather than asserting.
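// (Illustrative: if %base is uniform-after-vectorization, only its lane-0
// value is generated, so a scalarized user of %base would reference lanes
// 1..VF-1 that never exist.)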
5645 for (Use &U : I->operands()) 5646 if (auto *J = dyn_cast<Instruction>(U.get())) 5647 if (isUniformAfterVectorization(J, VF)) 5648 return false; 5649 5650 // Otherwise, we can scalarize the instruction. 5651 return true; 5652 }; 5653 5654 // Compute the expected cost discount from scalarizing the entire expression 5655 // feeding the predicated instruction. We currently only consider expressions 5656 // that are single-use instruction chains. 5657 Worklist.push_back(PredInst); 5658 while (!Worklist.empty()) { 5659 Instruction *I = Worklist.pop_back_val(); 5660 5661 // If we've already analyzed the instruction, there's nothing to do. 5662 if (ScalarCosts.find(I) != ScalarCosts.end()) 5663 continue; 5664 5665 // Compute the cost of the vector instruction. Note that this cost already 5666 // includes the scalarization overhead of the predicated instruction. 5667 unsigned VectorCost = getInstructionCost(I, VF).first; 5668 5669 // Compute the cost of the scalarized instruction. This cost is the cost of 5670 // the instruction as if it wasn't if-converted and instead remained in the 5671 // predicated block. We will scale this cost by block probability after 5672 // computing the scalarization overhead. 5673 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5674 5675 // Compute the scalarization overhead of needed insertelement instructions 5676 // and phi nodes. 5677 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5678 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5679 true, false); 5680 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5681 } 5682 5683 // Compute the scalarization overhead of needed extractelement 5684 // instructions. For each of the instruction's operands, if the operand can 5685 // be scalarized, add it to the worklist; otherwise, account for the 5686 // overhead. 5687 for (Use &U : I->operands()) 5688 if (auto *J = dyn_cast<Instruction>(U.get())) { 5689 assert(VectorType::isValidElementType(J->getType()) && 5690 "Instruction has non-scalar type"); 5691 if (canBeScalarized(J)) 5692 Worklist.push_back(J); 5693 else if (needsExtract(J, VF)) 5694 ScalarCost += TTI.getScalarizationOverhead( 5695 ToVectorTy(J->getType(),VF), false, true); 5696 } 5697 5698 // Scale the total scalar cost by block probability. 5699 ScalarCost /= getReciprocalPredBlockProb(); 5700 5701 // Compute the discount. A non-negative discount means the vector version 5702 // of the instruction costs more, and scalarizing would be beneficial. 5703 Discount += VectorCost - ScalarCost; 5704 ScalarCosts[I] = ScalarCost; 5705 } 5706 5707 return Discount; 5708 } 5709 5710 LoopVectorizationCostModel::VectorizationCostTy 5711 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5712 VectorizationCostTy Cost; 5713 5714 // For each block. 5715 for (BasicBlock *BB : TheLoop->blocks()) { 5716 VectorizationCostTy BlockCost; 5717 5718 // For each instruction in the old loop. 5719 for (Instruction &I : BB->instructionsWithoutDebug()) { 5720 // Skip ignored values. 5721 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5722 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5723 continue; 5724 5725 VectorizationCostTy C = getInstructionCost(&I, VF); 5726 5727 // Check if we should override the cost. 
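// (ForceTargetInstructionCost is a command-line override; getNumOccurrences()
// below is non-zero only when the flag was passed explicitly, in which case
// every instruction's estimated cost is replaced by that single value, which
// is mainly useful for testing the cost model.)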
5728 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5729 C.first = ForceTargetInstructionCost; 5730 5731 BlockCost.first += C.first; 5732 BlockCost.second |= C.second; 5733 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5734 << " for VF " << VF << " For instruction: " << I 5735 << '\n'); 5736 } 5737 5738 // If we are vectorizing a predicated block, it will have been 5739 // if-converted. This means that the block's instructions (aside from 5740 // stores and instructions that may divide by zero) will now be 5741 // unconditionally executed. For the scalar case, we may not always execute 5742 // the predicated block. Thus, scale the block's cost by the probability of 5743 // executing it. 5744 if (VF == 1 && blockNeedsPredication(BB)) 5745 BlockCost.first /= getReciprocalPredBlockProb(); 5746 5747 Cost.first += BlockCost.first; 5748 Cost.second |= BlockCost.second; 5749 } 5750 5751 return Cost; 5752 } 5753 5754 /// Gets Address Access SCEV after verifying that the access pattern 5755 /// is loop invariant except the induction variable dependence. 5756 /// 5757 /// This SCEV can be sent to the Target in order to estimate the address 5758 /// calculation cost. 5759 static const SCEV *getAddressAccessSCEV( 5760 Value *Ptr, 5761 LoopVectorizationLegality *Legal, 5762 PredicatedScalarEvolution &PSE, 5763 const Loop *TheLoop) { 5764 5765 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5766 if (!Gep) 5767 return nullptr; 5768 5769 // We are looking for a gep with all loop invariant indices except for one 5770 // which should be an induction variable. 5771 auto SE = PSE.getSE(); 5772 unsigned NumOperands = Gep->getNumOperands(); 5773 for (unsigned i = 1; i < NumOperands; ++i) { 5774 Value *Opd = Gep->getOperand(i); 5775 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5776 !Legal->isInductionVariable(Opd)) 5777 return nullptr; 5778 } 5779 5780 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5781 return PSE.getSCEV(Ptr); 5782 } 5783 5784 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5785 return Legal->hasStride(I->getOperand(0)) || 5786 Legal->hasStride(I->getOperand(1)); 5787 } 5788 5789 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5790 unsigned VF) { 5791 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5792 Type *ValTy = getMemInstValueType(I); 5793 auto SE = PSE.getSE(); 5794 5795 unsigned AS = getLoadStoreAddressSpace(I); 5796 Value *Ptr = getLoadStorePointerOperand(I); 5797 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5798 5799 // Figure out whether the access is strided and get the stride value 5800 // if it's known in compile time 5801 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5802 5803 // Get the cost of the scalar memory instruction and address computation. 5804 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5805 5806 // Don't pass *I here, since it is scalar but will actually be part of a 5807 // vectorized loop where the user of it is a vectorized instruction. 5808 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5809 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5810 Alignment, AS); 5811 5812 // Get the overhead of the extractelement and insertelement instructions 5813 // we might create due to scalarization. 5814 Cost += getScalarizationOverhead(I, VF); 5815 5816 // If we have a predicated store, it may not be executed for each vector 5817 // lane. 
Scale the cost by the probability of executing the predicated 5818 // block. 5819 if (isPredicatedInst(I)) { 5820 Cost /= getReciprocalPredBlockProb(); 5821 5822 if (useEmulatedMaskMemRefHack(I)) 5823 // Artificially setting to a high enough value to practically disable 5824 // vectorization with such operations. 5825 Cost = 3000000; 5826 } 5827 5828 return Cost; 5829 } 5830 5831 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5832 unsigned VF) { 5833 Type *ValTy = getMemInstValueType(I); 5834 Type *VectorTy = ToVectorTy(ValTy, VF); 5835 Value *Ptr = getLoadStorePointerOperand(I); 5836 unsigned AS = getLoadStoreAddressSpace(I); 5837 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5838 5839 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5840 "Stride should be 1 or -1 for consecutive memory access"); 5841 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5842 unsigned Cost = 0; 5843 if (Legal->isMaskRequired(I)) 5844 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5845 Alignment ? Alignment->value() : 0, AS); 5846 else 5847 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5848 5849 bool Reverse = ConsecutiveStride < 0; 5850 if (Reverse) 5851 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5852 return Cost; 5853 } 5854 5855 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5856 unsigned VF) { 5857 Type *ValTy = getMemInstValueType(I); 5858 Type *VectorTy = ToVectorTy(ValTy, VF); 5859 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5860 unsigned AS = getLoadStoreAddressSpace(I); 5861 if (isa<LoadInst>(I)) { 5862 return TTI.getAddressComputationCost(ValTy) + 5863 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5864 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5865 } 5866 StoreInst *SI = cast<StoreInst>(I); 5867 5868 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5869 return TTI.getAddressComputationCost(ValTy) + 5870 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5871 (isLoopInvariantStoreValue 5872 ? 0 5873 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5874 VF - 1)); 5875 } 5876 5877 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5878 unsigned VF) { 5879 Type *ValTy = getMemInstValueType(I); 5880 Type *VectorTy = ToVectorTy(ValTy, VF); 5881 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5882 Value *Ptr = getLoadStorePointerOperand(I); 5883 5884 return TTI.getAddressComputationCost(VectorTy) + 5885 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5886 Legal->isMaskRequired(I), 5887 Alignment ? Alignment->value() : 0); 5888 } 5889 5890 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5891 unsigned VF) { 5892 Type *ValTy = getMemInstValueType(I); 5893 Type *VectorTy = ToVectorTy(ValTy, VF); 5894 unsigned AS = getLoadStoreAddressSpace(I); 5895 5896 auto Group = getInterleavedAccessGroup(I); 5897 assert(Group && "Fail to get an interleaved access group."); 5898 5899 unsigned InterleaveFactor = Group->getFactor(); 5900 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5901 5902 // Holds the indices of existing members in an interleaved load group. 5903 // An interleaved store group doesn't need this as it doesn't allow gaps. 
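// For example (illustrative): in a factor-3 load group where only members
// 0 and 2 exist (A[3*i] and A[3*i + 2] are loaded but A[3*i + 1] is not),
// Indices becomes {0, 2}.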
5904 SmallVector<unsigned, 4> Indices; 5905 if (isa<LoadInst>(I)) { 5906 for (unsigned i = 0; i < InterleaveFactor; i++) 5907 if (Group->getMember(i)) 5908 Indices.push_back(i); 5909 } 5910 5911 // Calculate the cost of the whole interleaved group. 5912 bool UseMaskForGaps = 5913 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5914 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5915 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5916 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5917 5918 if (Group->isReverse()) { 5919 // TODO: Add support for reversed masked interleaved access. 5920 assert(!Legal->isMaskRequired(I) && 5921 "Reverse masked interleaved access not supported."); 5922 Cost += Group->getNumMembers() * 5923 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5924 } 5925 return Cost; 5926 } 5927 5928 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5929 unsigned VF) { 5930 // Calculate scalar cost only. Vectorization cost should be ready at this 5931 // moment. 5932 if (VF == 1) { 5933 Type *ValTy = getMemInstValueType(I); 5934 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5935 unsigned AS = getLoadStoreAddressSpace(I); 5936 5937 return TTI.getAddressComputationCost(ValTy) + 5938 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5939 } 5940 return getWideningCost(I, VF); 5941 } 5942 5943 LoopVectorizationCostModel::VectorizationCostTy 5944 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5945 // If we know that this instruction will remain uniform, check the cost of 5946 // the scalar version. 5947 if (isUniformAfterVectorization(I, VF)) 5948 VF = 1; 5949 5950 if (VF > 1 && isProfitableToScalarize(I, VF)) 5951 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5952 5953 // Forced scalars do not have any scalarization overhead. 5954 auto ForcedScalar = ForcedScalars.find(VF); 5955 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5956 auto InstSet = ForcedScalar->second; 5957 if (InstSet.find(I) != InstSet.end()) 5958 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5959 } 5960 5961 Type *VectorTy; 5962 unsigned C = getInstructionCost(I, VF, VectorTy); 5963 5964 bool TypeNotScalarized = 5965 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5966 return VectorizationCostTy(C, TypeNotScalarized); 5967 } 5968 5969 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5970 unsigned VF) { 5971 5972 if (VF == 1) 5973 return 0; 5974 5975 unsigned Cost = 0; 5976 Type *RetTy = ToVectorTy(I->getType(), VF); 5977 if (!RetTy->isVoidTy() && 5978 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5979 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5980 5981 // Some targets keep addresses scalar. 5982 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5983 return Cost; 5984 5985 // Some targets support efficient element stores. 5986 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5987 return Cost; 5988 5989 // Collect operands to consider. 5990 CallInst *CI = dyn_cast<CallInst>(I); 5991 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 5992 5993 // Skip operands that do not require extraction/scalarization and do not incur 5994 // any overhead. 
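// (E.g., constants, loop-invariant values and operands that remain scalar
// after vectorization need no extractelement, so they are filtered out
// before querying TTI below.)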
5995 return Cost + TTI.getOperandsScalarizationOverhead( 5996 filterExtractingOperands(Ops, VF), VF); 5997 } 5998 5999 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6000 if (VF == 1) 6001 return; 6002 NumPredStores = 0; 6003 for (BasicBlock *BB : TheLoop->blocks()) { 6004 // For each instruction in the old loop. 6005 for (Instruction &I : *BB) { 6006 Value *Ptr = getLoadStorePointerOperand(&I); 6007 if (!Ptr) 6008 continue; 6009 6010 // TODO: We should generate better code and update the cost model for 6011 // predicated uniform stores. Today they are treated as any other 6012 // predicated store (see added test cases in 6013 // invariant-store-vectorization.ll). 6014 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6015 NumPredStores++; 6016 6017 if (Legal->isUniform(Ptr) && 6018 // Conditional loads and stores should be scalarized and predicated. 6019 // isScalarWithPredication cannot be used here since masked 6020 // gather/scatters are not considered scalar with predication. 6021 !Legal->blockNeedsPredication(I.getParent())) { 6022 // TODO: Avoid replicating loads and stores instead of 6023 // relying on instcombine to remove them. 6024 // Load: Scalar load + broadcast 6025 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6026 unsigned Cost = getUniformMemOpCost(&I, VF); 6027 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6028 continue; 6029 } 6030 6031 // We assume that widening is the best solution when possible. 6032 if (memoryInstructionCanBeWidened(&I, VF)) { 6033 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6034 int ConsecutiveStride = 6035 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6036 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6037 "Expected consecutive stride."); 6038 InstWidening Decision = 6039 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6040 setWideningDecision(&I, VF, Decision, Cost); 6041 continue; 6042 } 6043 6044 // Choose between Interleaving, Gather/Scatter or Scalarization. 6045 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6046 unsigned NumAccesses = 1; 6047 if (isAccessInterleaved(&I)) { 6048 auto Group = getInterleavedAccessGroup(&I); 6049 assert(Group && "Fail to get an interleaved access group."); 6050 6051 // Make one decision for the whole group. 6052 if (getWideningDecision(&I, VF) != CM_Unknown) 6053 continue; 6054 6055 NumAccesses = Group->getNumMembers(); 6056 if (interleavedAccessCanBeWidened(&I, VF)) 6057 InterleaveCost = getInterleaveGroupCost(&I, VF); 6058 } 6059 6060 unsigned GatherScatterCost = 6061 isLegalGatherOrScatter(&I) 6062 ? getGatherScatterCost(&I, VF) * NumAccesses 6063 : std::numeric_limits<unsigned>::max(); 6064 6065 unsigned ScalarizationCost = 6066 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6067 6068 // Choose better solution for the current VF, 6069 // write down this decision and use it during vectorization. 6070 unsigned Cost; 6071 InstWidening Decision; 6072 if (InterleaveCost <= GatherScatterCost && 6073 InterleaveCost < ScalarizationCost) { 6074 Decision = CM_Interleave; 6075 Cost = InterleaveCost; 6076 } else if (GatherScatterCost < ScalarizationCost) { 6077 Decision = CM_GatherScatter; 6078 Cost = GatherScatterCost; 6079 } else { 6080 Decision = CM_Scalarize; 6081 Cost = ScalarizationCost; 6082 } 6083 // If the instructions belongs to an interleave group, the whole group 6084 // receives the same decision. 
The whole group receives the cost, but
6085 // the cost will actually be assigned to one instruction.
6086 if (auto Group = getInterleavedAccessGroup(&I))
6087 setWideningDecision(Group, VF, Decision, Cost);
6088 else
6089 setWideningDecision(&I, VF, Decision, Cost);
6090 }
6091 }
6092
6093 // Make sure that any load of address and any other address computation
6094 // remains scalar unless there is gather/scatter support. This avoids
6095 // inevitable extracts into address registers, and also has the benefit of
6096 // activating LSR more, since that pass can't optimize vectorized
6097 // addresses.
6098 if (TTI.prefersVectorizedAddressing())
6099 return;
6100
6101 // Start with all scalar pointer uses.
6102 SmallPtrSet<Instruction *, 8> AddrDefs;
6103 for (BasicBlock *BB : TheLoop->blocks())
6104 for (Instruction &I : *BB) {
6105 Instruction *PtrDef =
6106 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6107 if (PtrDef && TheLoop->contains(PtrDef) &&
6108 getWideningDecision(&I, VF) != CM_GatherScatter)
6109 AddrDefs.insert(PtrDef);
6110 }
6111
6112 // Add all instructions used to generate the addresses.
6113 SmallVector<Instruction *, 4> Worklist;
6114 for (auto *I : AddrDefs)
6115 Worklist.push_back(I);
6116 while (!Worklist.empty()) {
6117 Instruction *I = Worklist.pop_back_val();
6118 for (auto &Op : I->operands())
6119 if (auto *InstOp = dyn_cast<Instruction>(Op))
6120 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6121 AddrDefs.insert(InstOp).second)
6122 Worklist.push_back(InstOp);
6123 }
6124
6125 for (auto *I : AddrDefs) {
6126 if (isa<LoadInst>(I)) {
6127 // Setting the desired widening decision should ideally be handled by
6128 // the cost functions, but since this involves finding out whether the
6129 // loaded register is involved in an address computation, it is instead
6130 // changed here when we know this is the case.
6131 InstWidening Decision = getWideningDecision(I, VF);
6132 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6133 // Scalarize a widened load of address.
6134 setWideningDecision(I, VF, CM_Scalarize,
6135 (VF * getMemoryInstructionCost(I, 1)));
6136 else if (auto Group = getInterleavedAccessGroup(I)) {
6137 // Scalarize an interleave group of address loads.
6138 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6139 if (Instruction *Member = Group->getMember(I))
6140 setWideningDecision(Member, VF, CM_Scalarize,
6141 (VF * getMemoryInstructionCost(Member, 1)));
6142 }
6143 }
6144 } else
6145 // Make sure I gets scalarized and is given a cost estimate without
6146 // scalarization overhead.
6147 ForcedScalars[VF].insert(I);
6148 }
6149 }
6150
6151 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6152 unsigned VF,
6153 Type *&VectorTy) {
6154 Type *RetTy = I->getType();
6155 if (canTruncateToMinimalBitwidth(I, VF))
6156 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6157 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6158 auto SE = PSE.getSE();
6159
6160 // TODO: We need to estimate the cost of intrinsic calls.
6161 switch (I->getOpcode()) {
6162 case Instruction::GetElementPtr:
6163 // We mark this instruction as zero-cost because the cost of GEPs in
6164 // vectorized code depends on whether the corresponding memory instruction
6165 // is scalarized or not. Therefore, we handle GEPs with the memory
6166 // instruction cost.
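// (For instance, a scalarized access already pays VF *
// getAddressComputationCost(...) inside getMemInstScalarizationCost, so
// charging the GEP again here would double-count the address arithmetic.)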
6167 return 0; 6168 case Instruction::Br: { 6169 // In cases of scalarized and predicated instructions, there will be VF 6170 // predicated blocks in the vectorized loop. Each branch around these 6171 // blocks requires also an extract of its vector compare i1 element. 6172 bool ScalarPredicatedBB = false; 6173 BranchInst *BI = cast<BranchInst>(I); 6174 if (VF > 1 && BI->isConditional() && 6175 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6176 PredicatedBBsAfterVectorization.end() || 6177 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6178 PredicatedBBsAfterVectorization.end())) 6179 ScalarPredicatedBB = true; 6180 6181 if (ScalarPredicatedBB) { 6182 // Return cost for branches around scalarized and predicated blocks. 6183 Type *Vec_i1Ty = 6184 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6185 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6186 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6187 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6188 // The back-edge branch will remain, as will all scalar branches. 6189 return TTI.getCFInstrCost(Instruction::Br); 6190 else 6191 // This branch will be eliminated by if-conversion. 6192 return 0; 6193 // Note: We currently assume zero cost for an unconditional branch inside 6194 // a predicated block since it will become a fall-through, although we 6195 // may decide in the future to call TTI for all branches. 6196 } 6197 case Instruction::PHI: { 6198 auto *Phi = cast<PHINode>(I); 6199 6200 // First-order recurrences are replaced by vector shuffles inside the loop. 6201 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6202 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6203 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6204 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6205 6206 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6207 // converted into select instructions. We require N - 1 selects per phi 6208 // node, where N is the number of incoming values. 6209 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6210 return (Phi->getNumIncomingValues() - 1) * 6211 TTI.getCmpSelInstrCost( 6212 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6213 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6214 6215 return TTI.getCFInstrCost(Instruction::PHI); 6216 } 6217 case Instruction::UDiv: 6218 case Instruction::SDiv: 6219 case Instruction::URem: 6220 case Instruction::SRem: 6221 // If we have a predicated instruction, it may not be executed for each 6222 // vector lane. Get the scalarization cost and scale this amount by the 6223 // probability of executing the predicated block. If the instruction is not 6224 // predicated, we fall through to the next case. 6225 if (VF > 1 && isScalarWithPredication(I)) { 6226 unsigned Cost = 0; 6227 6228 // These instructions have a non-void type, so account for the phi nodes 6229 // that we will create. This cost is likely to be zero. The phi node 6230 // cost, if any, should be scaled by the block probability because it 6231 // models a copy at the end of each predicated block. 6232 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6233 6234 // The cost of the non-predicated instruction. 6235 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6236 6237 // The cost of insertelement and extractelement instructions needed for 6238 // scalarization. 
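// (Illustrative numbers only: with VF = 4, a PHI cost of 0, a scalar divide
// cost of 20 and a scalarization overhead of 12, the final result is
// (4 * 0 + 4 * 20 + 12) / 2 = 46, taking getReciprocalPredBlockProb() at its
// usual value of 2.)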
6239 Cost += getScalarizationOverhead(I, VF); 6240 6241 // Scale the cost by the probability of executing the predicated blocks. 6242 // This assumes the predicated block for each vector lane is equally 6243 // likely. 6244 return Cost / getReciprocalPredBlockProb(); 6245 } 6246 LLVM_FALLTHROUGH; 6247 case Instruction::Add: 6248 case Instruction::FAdd: 6249 case Instruction::Sub: 6250 case Instruction::FSub: 6251 case Instruction::Mul: 6252 case Instruction::FMul: 6253 case Instruction::FDiv: 6254 case Instruction::FRem: 6255 case Instruction::Shl: 6256 case Instruction::LShr: 6257 case Instruction::AShr: 6258 case Instruction::And: 6259 case Instruction::Or: 6260 case Instruction::Xor: { 6261 // Since we will replace the stride by 1 the multiplication should go away. 6262 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6263 return 0; 6264 // Certain instructions can be cheaper to vectorize if they have a constant 6265 // second vector operand. One example of this are shifts on x86. 6266 Value *Op2 = I->getOperand(1); 6267 TargetTransformInfo::OperandValueProperties Op2VP; 6268 TargetTransformInfo::OperandValueKind Op2VK = 6269 TTI.getOperandInfo(Op2, Op2VP); 6270 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6271 Op2VK = TargetTransformInfo::OK_UniformValue; 6272 6273 SmallVector<const Value *, 4> Operands(I->operand_values()); 6274 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6275 return N * TTI.getArithmeticInstrCost( 6276 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6277 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6278 } 6279 case Instruction::FNeg: { 6280 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6281 return N * TTI.getArithmeticInstrCost( 6282 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6283 TargetTransformInfo::OK_AnyValue, 6284 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6285 I->getOperand(0), I); 6286 } 6287 case Instruction::Select: { 6288 SelectInst *SI = cast<SelectInst>(I); 6289 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6290 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6291 Type *CondTy = SI->getCondition()->getType(); 6292 if (!ScalarCond) 6293 CondTy = VectorType::get(CondTy, VF); 6294 6295 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6296 } 6297 case Instruction::ICmp: 6298 case Instruction::FCmp: { 6299 Type *ValTy = I->getOperand(0)->getType(); 6300 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6301 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6302 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6303 VectorTy = ToVectorTy(ValTy, VF); 6304 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6305 } 6306 case Instruction::Store: 6307 case Instruction::Load: { 6308 unsigned Width = VF; 6309 if (Width > 1) { 6310 InstWidening Decision = getWideningDecision(I, Width); 6311 assert(Decision != CM_Unknown && 6312 "CM decision should be taken at this point"); 6313 if (Decision == CM_Scalarize) 6314 Width = 1; 6315 } 6316 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6317 return getMemoryInstructionCost(I, VF); 6318 } 6319 case Instruction::ZExt: 6320 case Instruction::SExt: 6321 case Instruction::FPToUI: 6322 case Instruction::FPToSI: 6323 case Instruction::FPExt: 6324 case Instruction::PtrToInt: 6325 case Instruction::IntToPtr: 6326 case Instruction::SIToFP: 6327 case Instruction::UIToFP: 6328 case 
Instruction::Trunc: 6329 case Instruction::FPTrunc: 6330 case Instruction::BitCast: { 6331 // We optimize the truncation of induction variables having constant 6332 // integer steps. The cost of these truncations is the same as the scalar 6333 // operation. 6334 if (isOptimizableIVTruncate(I, VF)) { 6335 auto *Trunc = cast<TruncInst>(I); 6336 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6337 Trunc->getSrcTy(), Trunc); 6338 } 6339 6340 Type *SrcScalarTy = I->getOperand(0)->getType(); 6341 Type *SrcVecTy = 6342 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6343 if (canTruncateToMinimalBitwidth(I, VF)) { 6344 // This cast is going to be shrunk. This may remove the cast or it might 6345 // turn it into slightly different cast. For example, if MinBW == 16, 6346 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6347 // 6348 // Calculate the modified src and dest types. 6349 Type *MinVecTy = VectorTy; 6350 if (I->getOpcode() == Instruction::Trunc) { 6351 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6352 VectorTy = 6353 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6354 } else if (I->getOpcode() == Instruction::ZExt || 6355 I->getOpcode() == Instruction::SExt) { 6356 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6357 VectorTy = 6358 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6359 } 6360 } 6361 6362 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6363 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6364 } 6365 case Instruction::Call: { 6366 bool NeedToScalarize; 6367 CallInst *CI = cast<CallInst>(I); 6368 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6369 if (getVectorIntrinsicIDForCall(CI, TLI)) 6370 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6371 return CallCost; 6372 } 6373 default: 6374 // The cost of executing VF copies of the scalar instruction. This opcode 6375 // is unknown. Assume that it is the same as 'mul'. 6376 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6377 getScalarizationOverhead(I, VF); 6378 } // end of switch. 
6379 } 6380 6381 char LoopVectorize::ID = 0; 6382 6383 static const char lv_name[] = "Loop Vectorization"; 6384 6385 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6386 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6387 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6388 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6389 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6390 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6391 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6392 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6393 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6394 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6395 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6396 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6397 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6398 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6399 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6400 6401 namespace llvm { 6402 6403 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6404 6405 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6406 bool VectorizeOnlyWhenForced) { 6407 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6408 } 6409 6410 } // end namespace llvm 6411 6412 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6413 // Check if the pointer operand of a load or store instruction is 6414 // consecutive. 6415 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6416 return Legal->isConsecutivePtr(Ptr); 6417 return false; 6418 } 6419 6420 void LoopVectorizationCostModel::collectValuesToIgnore() { 6421 // Ignore ephemeral values. 6422 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6423 6424 // Ignore type-promoting instructions we identified during reduction 6425 // detection. 6426 for (auto &Reduction : *Legal->getReductionVars()) { 6427 RecurrenceDescriptor &RedDes = Reduction.second; 6428 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6429 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6430 } 6431 // Ignore type-casting instructions we identified during induction 6432 // detection. 6433 for (auto &Induction : *Legal->getInductionVars()) { 6434 InductionDescriptor &IndDes = Induction.second; 6435 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6436 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6437 } 6438 } 6439 6440 // TODO: we could return a pair of values that specify the max VF and 6441 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6442 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6443 // doesn't have a cost model that can choose which plan to execute if 6444 // more than one is generated. 6445 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6446 LoopVectorizationCostModel &CM) { 6447 unsigned WidestType; 6448 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6449 return WidestVectorRegBits / WidestType; 6450 } 6451 6452 VectorizationFactor 6453 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6454 unsigned VF = UserVF; 6455 // Outer loop handling: They may require CFG and instruction level 6456 // transformations before even evaluating whether vectorization is profitable. 6457 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6458 // the vectorization pipeline. 
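// (Note: OrigLoop->empty() is true when the loop has no subloops, so VPlans
// are built below only for outer loops; innermost loops fall through to the
// bail-out at the end of this function.)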
6459 if (!OrigLoop->empty()) {
6460 // If the user doesn't provide a vectorization factor, determine a
6461 // reasonable one.
6462 if (!UserVF) {
6463 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6464 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6465
6466 // Make sure we have a VF > 1 for stress testing.
6467 if (VPlanBuildStressTest && VF < 2) {
6468 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6469 << "overriding computed VF.\n");
6470 VF = 4;
6471 }
6472 }
6473 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6474 assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6475 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6476 << " to build VPlans.\n");
6477 buildVPlans(VF, VF);
6478
6479 // For VPlan build stress testing, we bail out after VPlan construction.
6480 if (VPlanBuildStressTest)
6481 return VectorizationFactor::Disabled();
6482
6483 return {VF, 0};
6484 }
6485
6486 LLVM_DEBUG(
6487 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6488 "VPlan-native path.\n");
6489 return VectorizationFactor::Disabled();
6490 }
6491
6492 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6493 assert(OrigLoop->empty() && "Inner loop expected.");
6494 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6495 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6496 return None;
6497
6498 // Invalidate interleave groups if all blocks of the loop will be predicated.
6499 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6500 !useMaskedInterleavedAccesses(*TTI)) {
6501 LLVM_DEBUG(
6502 dbgs()
6503 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6504 "which requires masked-interleaved support.\n");
6505 CM.InterleaveInfo.reset();
6506 }
6507
6508 if (UserVF) {
6509 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6510 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6511 // Collect the instructions (and their associated costs) that will be more
6512 // profitable to scalarize.
6513 CM.selectUserVectorizationFactor(UserVF);
6514 buildVPlansWithVPRecipes(UserVF, UserVF);
6515 LLVM_DEBUG(printPlans(dbgs()));
6516 return {{UserVF, 0}};
6517 }
6518
6519 unsigned MaxVF = MaybeMaxVF.getValue();
6520 assert(MaxVF != 0 && "MaxVF is zero.");
6521
6522 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6523 // Collect Uniform and Scalar instructions after vectorization with VF.
6524 CM.collectUniformsAndScalars(VF);
6525
6526 // Collect the instructions (and their associated costs) that will be more
6527 // profitable to scalarize.
6528 if (VF > 1)
6529 CM.collectInstsToScalarize(VF);
6530 }
6531
6532 buildVPlansWithVPRecipes(1, MaxVF);
6533 LLVM_DEBUG(printPlans(dbgs()));
6534 if (MaxVF == 1)
6535 return VectorizationFactor::Disabled();
6536
6537 // Select the optimal vectorization factor.
6538 return CM.selectVectorizationFactor(MaxVF);
6539 }
6540
6541 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6542 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6543 << '\n');
6544 BestVF = VF;
6545 BestUF = UF;
6546
6547 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6548 return !Plan->hasVF(VF);
6549 });
6550 assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6551 }
6552
6553 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6554 DominatorTree *DT) {
6555 // Perform the actual loop transformation.
6556
6557 // 1.
Create a new empty loop. Unlink the old loop and connect the new one. 6558 VPCallbackILV CallbackILV(ILV); 6559 6560 VPTransformState State{BestVF, BestUF, LI, 6561 DT, ILV.Builder, ILV.VectorLoopValueMap, 6562 &ILV, CallbackILV}; 6563 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6564 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6565 6566 //===------------------------------------------------===// 6567 // 6568 // Notice: any optimization or new instruction that go 6569 // into the code below should also be implemented in 6570 // the cost-model. 6571 // 6572 //===------------------------------------------------===// 6573 6574 // 2. Copy and widen instructions from the old loop into the new loop. 6575 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6576 VPlans.front()->execute(&State); 6577 6578 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6579 // predication, updating analyses. 6580 ILV.fixVectorizedLoop(); 6581 } 6582 6583 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6584 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6585 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6586 6587 // We create new control-flow for the vectorized loop, so the original 6588 // condition will be dead after vectorization if it's only used by the 6589 // branch. 6590 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6591 if (Cmp && Cmp->hasOneUse()) 6592 DeadInstructions.insert(Cmp); 6593 6594 // We create new "steps" for induction variable updates to which the original 6595 // induction variables map. An original update instruction will be dead if 6596 // all its users except the induction variable are dead. 6597 for (auto &Induction : *Legal->getInductionVars()) { 6598 PHINode *Ind = Induction.first; 6599 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6600 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6601 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6602 DeadInstructions.end(); 6603 })) 6604 DeadInstructions.insert(IndUpdate); 6605 6606 // We record as "Dead" also the type-casting instructions we had identified 6607 // during induction analysis. We don't need any handling for them in the 6608 // vectorized loop because we have proven that, under a proper runtime 6609 // test guarding the vectorized loop, the value of the phi, and the casted 6610 // value of the phi, are the same. The last instruction in this casting chain 6611 // will get its scalar/vector/widened def from the scalar/vector/widened def 6612 // of the respective phi node. Any other casts in the induction def-use chain 6613 // have no other uses outside the phi update chain, and will be ignored. 6614 InductionDescriptor &IndDes = Induction.second; 6615 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6616 DeadInstructions.insert(Casts.begin(), Casts.end()); 6617 } 6618 } 6619 6620 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6621 6622 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6623 6624 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6625 Instruction::BinaryOps BinOp) { 6626 // When unrolling and the VF is 1, we only need to add a simple scalar. 
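// For example (illustrative), for an integer Val with StartIdx == 2 and step
// %s, the code below produces Val + 2 * %s; the floating-point path instead
// forms Val BinOp (StartIdx * Step) under fast-math flags.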
6627 Type *Ty = Val->getType(); 6628 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6629 6630 if (Ty->isFloatingPointTy()) { 6631 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6632 6633 // Floating point operations had to be 'fast' to enable the unrolling. 6634 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6635 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6636 } 6637 Constant *C = ConstantInt::get(Ty, StartIdx); 6638 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6639 } 6640 6641 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6642 SmallVector<Metadata *, 4> MDs; 6643 // Reserve first location for self reference to the LoopID metadata node. 6644 MDs.push_back(nullptr); 6645 bool IsUnrollMetadata = false; 6646 MDNode *LoopID = L->getLoopID(); 6647 if (LoopID) { 6648 // First find existing loop unrolling disable metadata. 6649 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6650 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6651 if (MD) { 6652 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6653 IsUnrollMetadata = 6654 S && S->getString().startswith("llvm.loop.unroll.disable"); 6655 } 6656 MDs.push_back(LoopID->getOperand(i)); 6657 } 6658 } 6659 6660 if (!IsUnrollMetadata) { 6661 // Add runtime unroll disable metadata. 6662 LLVMContext &Context = L->getHeader()->getContext(); 6663 SmallVector<Metadata *, 1> DisableOperands; 6664 DisableOperands.push_back( 6665 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6666 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6667 MDs.push_back(DisableNode); 6668 MDNode *NewLoopID = MDNode::get(Context, MDs); 6669 // Set operand 0 to refer to the loop id itself. 6670 NewLoopID->replaceOperandWith(0, NewLoopID); 6671 L->setLoopID(NewLoopID); 6672 } 6673 } 6674 6675 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6676 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6677 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6678 bool PredicateAtRangeStart = Predicate(Range.Start); 6679 6680 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6681 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6682 Range.End = TmpVF; 6683 break; 6684 } 6685 6686 return PredicateAtRangeStart; 6687 } 6688 6689 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6690 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6691 /// of VF's starting at a given VF and extending it as much as possible. Each 6692 /// vectorization decision can potentially shorten this sub-range during 6693 /// buildVPlan(). 6694 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6695 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6696 VFRange SubRange = {VF, MaxVF + 1}; 6697 VPlans.push_back(buildVPlan(SubRange)); 6698 VF = SubRange.End; 6699 } 6700 } 6701 6702 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6703 VPlanPtr &Plan) { 6704 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6705 6706 // Look for cached value. 6707 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6708 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6709 if (ECEntryIt != EdgeMaskCache.end()) 6710 return ECEntryIt->second; 6711 6712 VPValue *SrcMask = createBlockInMask(Src, Plan); 6713 6714 // The terminator has to be a branch inst! 
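// (Illustrative: if Src ends in 'br i1 %c, label %Dst, label %Other', the
// edge mask built below is %c, negated when Dst is the false successor, and
// AND'ed with Src's block mask unless that mask is all-one.)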
6715 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6716 assert(BI && "Unexpected terminator found"); 6717 6718 if (!BI->isConditional()) 6719 return EdgeMaskCache[Edge] = SrcMask; 6720 6721 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6722 assert(EdgeMask && "No Edge Mask found for condition"); 6723 6724 if (BI->getSuccessor(0) != Dst) 6725 EdgeMask = Builder.createNot(EdgeMask); 6726 6727 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6728 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6729 6730 return EdgeMaskCache[Edge] = EdgeMask; 6731 } 6732 6733 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6734 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6735 6736 // Look for cached value. 6737 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6738 if (BCEntryIt != BlockMaskCache.end()) 6739 return BCEntryIt->second; 6740 6741 // All-one mask is modelled as no-mask following the convention for masked 6742 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6743 VPValue *BlockMask = nullptr; 6744 6745 if (OrigLoop->getHeader() == BB) { 6746 if (!CM.blockNeedsPredication(BB)) 6747 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6748 6749 // Introduce the early-exit compare IV <= BTC to form header block mask. 6750 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6751 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6752 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6753 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6754 return BlockMaskCache[BB] = BlockMask; 6755 } 6756 6757 // This is the block mask. We OR all incoming edges. 6758 for (auto *Predecessor : predecessors(BB)) { 6759 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6760 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6761 return BlockMaskCache[BB] = EdgeMask; 6762 6763 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6764 BlockMask = EdgeMask; 6765 continue; 6766 } 6767 6768 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6769 } 6770 6771 return BlockMaskCache[BB] = BlockMask; 6772 } 6773 6774 VPWidenMemoryInstructionRecipe * 6775 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6776 VPlanPtr &Plan) { 6777 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6778 return nullptr; 6779 6780 auto willWiden = [&](unsigned VF) -> bool { 6781 if (VF == 1) 6782 return false; 6783 LoopVectorizationCostModel::InstWidening Decision = 6784 CM.getWideningDecision(I, VF); 6785 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6786 "CM decision should be taken at this point."); 6787 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6788 return true; 6789 if (CM.isScalarAfterVectorization(I, VF) || 6790 CM.isProfitableToScalarize(I, VF)) 6791 return false; 6792 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6793 }; 6794 6795 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6796 return nullptr; 6797 6798 VPValue *Mask = nullptr; 6799 if (Legal->isMaskRequired(I)) 6800 Mask = createBlockInMask(I->getParent(), Plan); 6801 6802 return new VPWidenMemoryInstructionRecipe(*I, Mask); 6803 } 6804 6805 VPWidenIntOrFpInductionRecipe * 6806 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6807 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6808 // Check if this is an integer or fp induction. 
If so, build the recipe that 6809 // produces its scalar and vector values. 6810 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); 6811 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6812 II.getKind() == InductionDescriptor::IK_FpInduction) 6813 return new VPWidenIntOrFpInductionRecipe(Phi); 6814 6815 return nullptr; 6816 } 6817 6818 // Optimize the special case where the source is a constant integer 6819 // induction variable. Notice that we can only optimize the 'trunc' case 6820 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6821 // (c) other casts depend on pointer size. 6822 6823 // Determine whether \p K is a truncation based on an induction variable that 6824 // can be optimized. 6825 auto isOptimizableIVTruncate = 6826 [&](Instruction *K) -> std::function<bool(unsigned)> { 6827 return 6828 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6829 }; 6830 6831 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6832 isOptimizableIVTruncate(I), Range)) 6833 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6834 cast<TruncInst>(I)); 6835 return nullptr; 6836 } 6837 6838 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6839 PHINode *Phi = dyn_cast<PHINode>(I); 6840 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6841 return nullptr; 6842 6843 // We know that all PHIs in non-header blocks are converted into selects, so 6844 // we don't have to worry about the insertion order and we can just use the 6845 // builder. At this point we generate the predication tree. There may be 6846 // duplications since this is a simple recursive scan, but future 6847 // optimizations will clean it up. 6848 6849 SmallVector<VPValue *, 2> Masks; 6850 unsigned NumIncoming = Phi->getNumIncomingValues(); 6851 for (unsigned In = 0; In < NumIncoming; In++) { 6852 VPValue *EdgeMask = 6853 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6854 assert((EdgeMask || NumIncoming == 1) && 6855 "Multiple predecessors with one having a full mask"); 6856 if (EdgeMask) 6857 Masks.push_back(EdgeMask); 6858 } 6859 return new VPBlendRecipe(Phi, Masks); 6860 } 6861 6862 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6863 VFRange &Range) { 6864 6865 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6866 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6867 6868 if (IsPredicated) 6869 return false; 6870 6871 auto IsVectorizableOpcode = [](unsigned Opcode) { 6872 switch (Opcode) { 6873 case Instruction::Add: 6874 case Instruction::And: 6875 case Instruction::AShr: 6876 case Instruction::BitCast: 6877 case Instruction::Br: 6878 case Instruction::Call: 6879 case Instruction::FAdd: 6880 case Instruction::FCmp: 6881 case Instruction::FDiv: 6882 case Instruction::FMul: 6883 case Instruction::FNeg: 6884 case Instruction::FPExt: 6885 case Instruction::FPToSI: 6886 case Instruction::FPToUI: 6887 case Instruction::FPTrunc: 6888 case Instruction::FRem: 6889 case Instruction::FSub: 6890 case Instruction::ICmp: 6891 case Instruction::IntToPtr: 6892 case Instruction::Load: 6893 case Instruction::LShr: 6894 case Instruction::Mul: 6895 case Instruction::Or: 6896 case Instruction::PHI: 6897 case Instruction::PtrToInt: 6898 case Instruction::SDiv: 6899 case Instruction::Select: 6900 case Instruction::SExt: 6901 case Instruction::Shl: 6902 case Instruction::SIToFP: 6903 case Instruction::SRem: 6904 case 
Instruction::Store: 6905 case Instruction::Sub: 6906 case Instruction::Trunc: 6907 case Instruction::UDiv: 6908 case Instruction::UIToFP: 6909 case Instruction::URem: 6910 case Instruction::Xor: 6911 case Instruction::ZExt: 6912 return true; 6913 } 6914 return false; 6915 }; 6916 6917 if (!IsVectorizableOpcode(I->getOpcode())) 6918 return false; 6919 6920 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6921 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6922 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6923 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6924 return false; 6925 } 6926 6927 auto willWiden = [&](unsigned VF) -> bool { 6928 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6929 CM.isProfitableToScalarize(I, VF))) 6930 return false; 6931 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6932 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6933 // The following case may be scalarized depending on the VF. 6934 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6935 // version of the instruction. 6936 // Is it beneficial to perform intrinsic call compared to lib call? 6937 bool NeedToScalarize; 6938 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6939 bool UseVectorIntrinsic = 6940 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6941 return UseVectorIntrinsic || !NeedToScalarize; 6942 } 6943 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6944 assert(CM.getWideningDecision(I, VF) == 6945 LoopVectorizationCostModel::CM_Scalarize && 6946 "Memory widening decisions should have been taken care by now"); 6947 return false; 6948 } 6949 return true; 6950 }; 6951 6952 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6953 return false; 6954 // If this ingredient's recipe is to be recorded, keep its recipe a singleton 6955 // to avoid having to split recipes later. 6956 bool IsSingleton = Ingredient2Recipe.count(I); 6957 6958 // Success: widen this instruction. 6959 6960 // Use the default widening recipe. We optimize the common case where 6961 // consecutive instructions can be represented by a single recipe. 6962 if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && 6963 LastExtensibleRecipe->appendInstruction(I)) 6964 return true; 6965 6966 VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); 6967 if (!IsSingleton) 6968 LastExtensibleRecipe = WidenRecipe; 6969 setRecipe(I, WidenRecipe); 6970 VPBB->appendRecipe(WidenRecipe); 6971 return true; 6972 } 6973 6974 VPBasicBlock *VPRecipeBuilder::handleReplication( 6975 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 6976 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 6977 VPlanPtr &Plan) { 6978 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 6979 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 6980 Range); 6981 6982 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6983 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6984 6985 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 6986 setRecipe(I, Recipe); 6987 6988 // Find if I uses a predicated instruction. If so, it will use its scalar 6989 // value. Avoid hoisting the insert-element which packs the scalar value into 6990 // a vector value, as that happens iff all users use the vector value. 
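// (Illustrative: if a predicated, replicated instruction such as a masked-off
// load feeds I, then I uses its per-lane scalar values, so the loop below
// clears that recipe's AlsoPack flag and its results are not additionally
// packed into a vector.)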
6991 for (auto &Op : I->operands()) 6992 if (auto *PredInst = dyn_cast<Instruction>(Op)) 6993 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 6994 PredInst2Recipe[PredInst]->setAlsoPack(false); 6995 6996 // Finalize the recipe for Instr, first if it is not predicated. 6997 if (!IsPredicated) { 6998 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 6999 VPBB->appendRecipe(Recipe); 7000 return VPBB; 7001 } 7002 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7003 assert(VPBB->getSuccessors().empty() && 7004 "VPBB has successors when handling predicated replication."); 7005 // Record predicated instructions for above packing optimizations. 7006 PredInst2Recipe[I] = Recipe; 7007 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7008 VPBlockUtils::insertBlockAfter(Region, VPBB); 7009 auto *RegSucc = new VPBasicBlock(); 7010 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7011 return RegSucc; 7012 } 7013 7014 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7015 VPRecipeBase *PredRecipe, 7016 VPlanPtr &Plan) { 7017 // Instructions marked for predication are replicated and placed under an 7018 // if-then construct to prevent side-effects. 7019 7020 // Generate recipes to compute the block mask for this region. 7021 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7022 7023 // Build the triangular if-then region. 7024 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7025 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7026 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7027 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7028 auto *PHIRecipe = 7029 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7030 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7031 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7032 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7033 7034 // Note: first set Entry as region entry and then connect successors starting 7035 // from it in order, to propagate the "parent" of each VPBasicBlock. 7036 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7037 VPBlockUtils::connectBlocks(Pred, Exit); 7038 7039 return Region; 7040 } 7041 7042 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 7043 VPlanPtr &Plan, VPBasicBlock *VPBB) { 7044 VPRecipeBase *Recipe = nullptr; 7045 7046 // First, check for specific widening recipes that deal with memory 7047 // operations, inductions and Phi nodes. 7048 if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || 7049 (Recipe = tryToOptimizeInduction(Instr, Range)) || 7050 (Recipe = tryToBlend(Instr, Plan)) || 7051 (isa<PHINode>(Instr) && 7052 (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { 7053 setRecipe(Instr, Recipe); 7054 VPBB->appendRecipe(Recipe); 7055 return true; 7056 } 7057 7058 // Handle GEP widening. 
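  // If the GEP is considered scalar over this whole VF range (predicated,
  // scalar after vectorization, or only profitable as scalars), no recipe is
  // built here and the caller falls back to handleReplication; otherwise a
  // VPWidenGEPRecipe widens it, keeping loop-invariant pointer and index
  // operands scalar.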
7059 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { 7060 auto Scalarize = [&](unsigned VF) { 7061 return CM.isScalarWithPredication(Instr, VF) || 7062 CM.isScalarAfterVectorization(Instr, VF) || 7063 CM.isProfitableToScalarize(Instr, VF); 7064 }; 7065 if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) 7066 return false; 7067 VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); 7068 setRecipe(Instr, Recipe); 7069 VPBB->appendRecipe(Recipe); 7070 return true; 7071 } 7072 7073 // Check if Instr is to be widened by a general VPWidenRecipe, after 7074 // having first checked for specific widening recipes. 7075 if (tryToWiden(Instr, VPBB, Range)) 7076 return true; 7077 7078 return false; 7079 } 7080 7081 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7082 unsigned MaxVF) { 7083 assert(OrigLoop->empty() && "Inner loop expected."); 7084 7085 // Collect conditions feeding internal conditional branches; they need to be 7086 // represented in VPlan for it to model masking. 7087 SmallPtrSet<Value *, 1> NeedDef; 7088 7089 auto *Latch = OrigLoop->getLoopLatch(); 7090 for (BasicBlock *BB : OrigLoop->blocks()) { 7091 if (BB == Latch) 7092 continue; 7093 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7094 if (Branch && Branch->isConditional()) 7095 NeedDef.insert(Branch->getCondition()); 7096 } 7097 7098 // If the tail is to be folded by masking, the primary induction variable 7099 // needs to be represented in VPlan for it to model early-exit masking. 7100 // Also, both the Phi and the live-out instruction of each reduction are 7101 // required in order to introduce a select between them in VPlan. 7102 if (CM.foldTailByMasking()) { 7103 NeedDef.insert(Legal->getPrimaryInduction()); 7104 for (auto &Reduction : *Legal->getReductionVars()) { 7105 NeedDef.insert(Reduction.first); 7106 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7107 } 7108 } 7109 7110 // Collect instructions from the original loop that will become trivially dead 7111 // in the vectorized loop. We don't need to vectorize these instructions. For 7112 // example, original induction update instructions can become dead because we 7113 // separately emit induction "steps" when generating code for the new loop. 7114 // Similarly, we create a new latch condition when setting up the structure 7115 // of the new loop, so the old one can become dead. 7116 SmallPtrSet<Instruction *, 4> DeadInstructions; 7117 collectTriviallyDeadInstructions(DeadInstructions); 7118 7119 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7120 VFRange SubRange = {VF, MaxVF + 1}; 7121 VPlans.push_back( 7122 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); 7123 VF = SubRange.End; 7124 } 7125 } 7126 7127 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7128 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7129 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7130 7131 // Hold a mapping from predicated instructions to their recipes, in order to 7132 // fix their AlsoPack behavior if a user is determined to replicate and use a 7133 // scalar instead of vector value. 
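  // (This map is local to building the current VPlan: handleReplication above
  // both records each predicated ingredient's recipe in it and clears that
  // recipe's AlsoPack flag once a replicated user of the ingredient is seen.)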
7134 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7135 7136 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7137 7138 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7139 7140 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); 7141 7142 // --------------------------------------------------------------------------- 7143 // Pre-construction: record ingredients whose recipes we'll need to further 7144 // process after constructing the initial VPlan. 7145 // --------------------------------------------------------------------------- 7146 7147 // Mark instructions we'll need to sink later and their targets as 7148 // ingredients whose recipe we'll need to record. 7149 for (auto &Entry : SinkAfter) { 7150 RecipeBuilder.recordRecipeOf(Entry.first); 7151 RecipeBuilder.recordRecipeOf(Entry.second); 7152 } 7153 7154 // For each interleave group which is relevant for this (possibly trimmed) 7155 // Range, add it to the set of groups to be later applied to the VPlan and add 7156 // placeholders for its members' Recipes which we'll be replacing with a 7157 // single VPInterleaveRecipe. 7158 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7159 auto applyIG = [IG, this](unsigned VF) -> bool { 7160 return (VF >= 2 && // Query is illegal for VF == 1 7161 CM.getWideningDecision(IG->getInsertPos(), VF) == 7162 LoopVectorizationCostModel::CM_Interleave); 7163 }; 7164 if (!getDecisionAndClampRange(applyIG, Range)) 7165 continue; 7166 InterleaveGroups.insert(IG); 7167 for (unsigned i = 0; i < IG->getFactor(); i++) 7168 if (Instruction *Member = IG->getMember(i)) 7169 RecipeBuilder.recordRecipeOf(Member); 7170 }; 7171 7172 // --------------------------------------------------------------------------- 7173 // Build initial VPlan: Scan the body of the loop in a topological order to 7174 // visit each basic block after having visited its predecessor basic blocks. 7175 // --------------------------------------------------------------------------- 7176 7177 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7178 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7179 auto Plan = std::make_unique<VPlan>(VPBB); 7180 7181 // Represent values that will have defs inside VPlan. 7182 for (Value *V : NeedDef) 7183 Plan->addVPValue(V); 7184 7185 // Scan the body of the loop in a topological order to visit each basic block 7186 // after having visited its predecessor basic blocks. 7187 LoopBlocksDFS DFS(OrigLoop); 7188 DFS.perform(LI); 7189 7190 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7191 // Relevant instructions from basic block BB will be grouped into VPRecipe 7192 // ingredients and fill a new VPBasicBlock. 7193 unsigned VPBBsForBB = 0; 7194 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7195 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7196 VPBB = FirstVPBBForBB; 7197 Builder.setInsertPoint(VPBB); 7198 7199 // Introduce each ingredient into VPlan. 7200 for (Instruction &I : BB->instructionsWithoutDebug()) { 7201 Instruction *Instr = &I; 7202 7203 // First filter out irrelevant instructions, to ensure no recipes are 7204 // built for them. 7205 if (isa<BranchInst>(Instr) || 7206 DeadInstructions.find(Instr) != DeadInstructions.end()) 7207 continue; 7208 7209 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7210 continue; 7211 7212 // Otherwise, if all widening options failed, Instruction is to be 7213 // replicated. 
This may create a successor for VPBB. 7214 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7215 Instr, Range, VPBB, PredInst2Recipe, Plan); 7216 if (NextVPBB != VPBB) { 7217 VPBB = NextVPBB; 7218 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7219 : ""); 7220 } 7221 } 7222 } 7223 7224 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7225 // may also be empty, such as the last one VPBB, reflecting original 7226 // basic-blocks with no recipes. 7227 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7228 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7229 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7230 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7231 delete PreEntry; 7232 7233 // --------------------------------------------------------------------------- 7234 // Transform initial VPlan: Apply previously taken decisions, in order, to 7235 // bring the VPlan to its final state. 7236 // --------------------------------------------------------------------------- 7237 7238 // Apply Sink-After legal constraints. 7239 for (auto &Entry : SinkAfter) { 7240 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7241 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7242 Sink->moveAfter(Target); 7243 } 7244 7245 // Interleave memory: for each Interleave Group we marked earlier as relevant 7246 // for this VPlan, replace the Recipes widening its memory instructions with a 7247 // single VPInterleaveRecipe at its insertion point. 7248 for (auto IG : InterleaveGroups) { 7249 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7250 RecipeBuilder.getRecipe(IG->getInsertPos())); 7251 (new VPInterleaveRecipe(IG, Recipe->getMask()))->insertBefore(Recipe); 7252 7253 for (unsigned i = 0; i < IG->getFactor(); ++i) 7254 if (Instruction *Member = IG->getMember(i)) { 7255 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7256 } 7257 } 7258 7259 // Finally, if tail is folded by masking, introduce selects between the phi 7260 // and the live-out instruction of each reduction, at the end of the latch. 7261 if (CM.foldTailByMasking()) { 7262 Builder.setInsertPoint(VPBB); 7263 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7264 for (auto &Reduction : *Legal->getReductionVars()) { 7265 VPValue *Phi = Plan->getVPValue(Reduction.first); 7266 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7267 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7268 } 7269 } 7270 7271 std::string PlanName; 7272 raw_string_ostream RSO(PlanName); 7273 unsigned VF = Range.Start; 7274 Plan->addVF(VF); 7275 RSO << "Initial VPlan for VF={" << VF; 7276 for (VF *= 2; VF < Range.End; VF *= 2) { 7277 Plan->addVF(VF); 7278 RSO << "," << VF; 7279 } 7280 RSO << "},UF>=1"; 7281 RSO.flush(); 7282 Plan->setName(PlanName); 7283 7284 return Plan; 7285 } 7286 7287 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7288 // Outer loop handling: They may require CFG and instruction level 7289 // transformations before even evaluating whether vectorization is profitable. 7290 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7291 // the vectorization pipeline. 
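  // Illustrative example (assumed values, not taken from a particular run):
  // with Range.Start == 4 and Range.End == 16, the VF registration loop below
  // attaches VFs 4 and 8 to the plan; Range.End is exclusive, so 16 itself is
  // not added.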
7292 assert(!OrigLoop->empty()); 7293 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7294 7295 // Create new empty VPlan 7296 auto Plan = std::make_unique<VPlan>(); 7297 7298 // Build hierarchical CFG 7299 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7300 HCFGBuilder.buildHierarchicalCFG(); 7301 7302 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7303 Plan->addVF(VF); 7304 7305 if (EnableVPlanPredication) { 7306 VPlanPredicator VPP(*Plan); 7307 VPP.predicate(); 7308 7309 // Avoid running transformation to recipes until masked code generation in 7310 // VPlan-native path is in place. 7311 return Plan; 7312 } 7313 7314 SmallPtrSet<Instruction *, 1> DeadInstructions; 7315 VPlanTransforms::VPInstructionsToVPRecipes( 7316 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7317 return Plan; 7318 } 7319 7320 Value* LoopVectorizationPlanner::VPCallbackILV:: 7321 getOrCreateVectorValues(Value *V, unsigned Part) { 7322 return ILV.getOrCreateVectorValue(V, Part); 7323 } 7324 7325 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { 7326 O << " +\n" 7327 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7328 IG->getInsertPos()->printAsOperand(O, false); 7329 if (User) { 7330 O << ", "; 7331 User->getOperand(0)->printAsOperand(O); 7332 } 7333 O << "\\l\""; 7334 for (unsigned i = 0; i < IG->getFactor(); ++i) 7335 if (Instruction *I = IG->getMember(i)) 7336 O << " +\n" 7337 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7338 } 7339 7340 void VPWidenRecipe::execute(VPTransformState &State) { 7341 for (auto &Instr : make_range(Begin, End)) 7342 State.ILV->widenInstruction(Instr); 7343 } 7344 7345 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7346 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7347 IsIndexLoopInvariant); 7348 } 7349 7350 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7351 assert(!State.Instance && "Int or FP induction being replicated."); 7352 State.ILV->widenIntOrFpInduction(IV, Trunc); 7353 } 7354 7355 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7356 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7357 } 7358 7359 void VPBlendRecipe::execute(VPTransformState &State) { 7360 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7361 // We know that all PHIs in non-header blocks are converted into 7362 // selects, so we don't have to worry about the insertion order and we 7363 // can just use the builder. 7364 // At this point we generate the predication tree. There may be 7365 // duplications since this is a simple recursive scan, but future 7366 // optimizations will clean it up. 7367 7368 unsigned NumIncoming = Phi->getNumIncomingValues(); 7369 7370 assert((User || NumIncoming == 1) && 7371 "Multiple predecessors with predecessors having a full mask"); 7372 // Generate a sequence of selects of the form: 7373 // SELECT(Mask3, In3, 7374 // SELECT(Mask2, In2, 7375 // ( ...))) 7376 InnerLoopVectorizer::VectorParts Entry(State.UF); 7377 for (unsigned In = 0; In < NumIncoming; ++In) { 7378 for (unsigned Part = 0; Part < State.UF; ++Part) { 7379 // We might have single edge PHIs (blocks) - use an identity 7380 // 'select' for the first PHI operand. 7381 Value *In0 = 7382 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7383 if (In == 0) 7384 Entry[Part] = In0; // Initialize with the first incoming value. 
7385 else { 7386 // Select between the current value and the previous incoming edge 7387 // based on the incoming mask. 7388 Value *Cond = State.get(User->getOperand(In), Part); 7389 Entry[Part] = 7390 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7391 } 7392 } 7393 } 7394 for (unsigned Part = 0; Part < State.UF; ++Part) 7395 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7396 } 7397 7398 void VPInterleaveRecipe::execute(VPTransformState &State) { 7399 assert(!State.Instance && "Interleave group being replicated."); 7400 if (!User) 7401 return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); 7402 7403 // Last (and currently only) operand is a mask. 7404 InnerLoopVectorizer::VectorParts MaskValues(State.UF); 7405 VPValue *Mask = User->getOperand(User->getNumOperands() - 1); 7406 for (unsigned Part = 0; Part < State.UF; ++Part) 7407 MaskValues[Part] = State.get(Mask, Part); 7408 State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); 7409 } 7410 7411 void VPReplicateRecipe::execute(VPTransformState &State) { 7412 if (State.Instance) { // Generate a single instance. 7413 State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); 7414 // Insert scalar instance packing it into a vector. 7415 if (AlsoPack && State.VF > 1) { 7416 // If we're constructing lane 0, initialize to start from undef. 7417 if (State.Instance->Lane == 0) { 7418 Value *Undef = 7419 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); 7420 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); 7421 } 7422 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); 7423 } 7424 return; 7425 } 7426 7427 // Generate scalar instances for all VF lanes of all UF parts, unless the 7428 // instruction is uniform, in which case generate only the first lane for each 7429 // of the UF parts. 7430 unsigned EndLane = IsUniform ? 1 : State.VF; 7431 for (unsigned Part = 0; Part < State.UF; ++Part) 7432 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 7433 State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); 7434 } 7435 7436 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 7437 assert(State.Instance && "Branch on Mask works only on single instance."); 7438 7439 unsigned Part = State.Instance->Part; 7440 unsigned Lane = State.Instance->Lane; 7441 7442 Value *ConditionBit = nullptr; 7443 if (!User) // Block in mask is all-one. 7444 ConditionBit = State.Builder.getTrue(); 7445 else { 7446 VPValue *BlockInMask = User->getOperand(0); 7447 ConditionBit = State.get(BlockInMask, Part); 7448 if (ConditionBit->getType()->isVectorTy()) 7449 ConditionBit = State.Builder.CreateExtractElement( 7450 ConditionBit, State.Builder.getInt32(Lane)); 7451 } 7452 7453 // Replace the temporary unreachable terminator with a new conditional branch, 7454 // whose two destinations will be set later when they are created.
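  // Roughly, and with illustrative (not actual) value names: the placeholder
  //   unreachable
  // terminating the current block is rewritten to
  //   br i1 %cond.lane, label <tbd>, label <tbd>
  // and both destinations are wired up later, when the region's ".if" and
  // ".continue" blocks are materialized.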
7455 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7456 assert(isa<UnreachableInst>(CurrentTerminator) && 7457 "Expected to replace unreachable terminator with conditional branch."); 7458 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7459 CondBr->setSuccessor(0, nullptr); 7460 ReplaceInstWithInst(CurrentTerminator, CondBr); 7461 } 7462 7463 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7464 assert(State.Instance && "Predicated instruction PHI works per instance."); 7465 Instruction *ScalarPredInst = cast<Instruction>( 7466 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7467 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7468 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7469 assert(PredicatingBB && "Predicated block has no single predecessor."); 7470 7471 // By current pack/unpack logic we need to generate only a single phi node: if 7472 // a vector value for the predicated instruction exists at this point it means 7473 // the instruction has vector users only, and a phi for the vector value is 7474 // needed. In this case the recipe of the predicated instruction is marked to 7475 // also do that packing, thereby "hoisting" the insert-element sequence. 7476 // Otherwise, a phi node for the scalar value is needed. 7477 unsigned Part = State.Instance->Part; 7478 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7479 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7480 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7481 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7482 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7483 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7484 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7485 } else { 7486 Type *PredInstType = PredInst->getType(); 7487 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7488 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7489 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7490 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7491 } 7492 } 7493 7494 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7495 VPValue *Mask = getMask(); 7496 if (!Mask) 7497 return State.ILV->vectorizeMemoryInstruction(&Instr); 7498 7499 InnerLoopVectorizer::VectorParts MaskValues(State.UF); 7500 for (unsigned Part = 0; Part < State.UF; ++Part) 7501 MaskValues[Part] = State.get(Mask, Part); 7502 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); 7503 } 7504 7505 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7506 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7507 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7508 // for predication. 7509 static ScalarEpilogueLowering getScalarEpilogueLowering( 7510 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7511 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7512 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7513 LoopVectorizationLegality &LVL) { 7514 bool OptSize = 7515 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7516 PGSOQueryType::IRPass); 7517 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7518 // don't look at hints or options, and don't request a scalar epilogue. 
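// For example (a sketch of the common -Os case, not an exhaustive rule): a
// function carrying the optsize attribute and no explicit vectorize(enable)
// hint returns here, and the cost model must then avoid a scalar remainder
// loop altogether, e.g. by folding the tail with masking, or give up on
// vectorizing.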
7519 if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) 7520 return CM_ScalarEpilogueNotAllowedOptSize; 7521 7522 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && 7523 !PreferPredicateOverEpilog; 7524 7525 // 2) Next, if disabling predication is requested on the command line, honour 7526 // this and request a scalar epilogue. Also do this if we don't have a 7527 // primary induction variable, which is required for predication. 7528 if (PredicateOptDisabled || !LVL.getPrimaryInduction()) 7529 return CM_ScalarEpilogueAllowed; 7530 7531 // 3) and 4) look if enabling predication is requested on the command line, 7532 // with a loop hint, or if the TTI hook indicates this is profitable, request 7533 // predication . 7534 if (PreferPredicateOverEpilog || 7535 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 7536 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, 7537 LVL.getLAI()) && 7538 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) 7539 return CM_ScalarEpilogueNotNeededUsePredicate; 7540 7541 return CM_ScalarEpilogueAllowed; 7542 } 7543 7544 // Process the loop in the VPlan-native vectorization path. This path builds 7545 // VPlan upfront in the vectorization pipeline, which allows to apply 7546 // VPlan-to-VPlan transformations from the very beginning without modifying the 7547 // input LLVM IR. 7548 static bool processLoopInVPlanNativePath( 7549 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7550 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7551 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7552 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7553 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7554 7555 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7556 Function *F = L->getHeader()->getParent(); 7557 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7558 7559 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7560 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); 7561 7562 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7563 &Hints, IAI); 7564 // Use the planner for outer loop vectorization. 7565 // TODO: CM is not used at this point inside the planner. Turn CM into an 7566 // optional argument if we don't need it in the future. 7567 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); 7568 7569 // Get user vectorization factor. 7570 const unsigned UserVF = Hints.getWidth(); 7571 7572 // Plan how to best vectorize, return the best VF and its cost. 7573 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7574 7575 // If we are stress testing VPlan builds, do not attempt to generate vector 7576 // code. Masked vector code generation support will follow soon. 7577 // Also, do not attempt to vectorize if no vector code will be produced. 7578 if (VPlanBuildStressTest || EnableVPlanPredication || 7579 VectorizationFactor::Disabled() == VF) 7580 return false; 7581 7582 LVP.setBestPlan(VF.Width, 1); 7583 7584 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7585 &CM); 7586 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7587 << L->getHeader()->getParent()->getName() << "\"\n"); 7588 LVP.executePlan(LB, DT); 7589 7590 // Mark the loop as already vectorized to avoid vectorizing again. 
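  // Concretely, this attaches llvm.loop.isvectorized to the loop's metadata so
  // that later runs of the vectorizer skip the already-transformed loop.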
7591 Hints.setAlreadyVectorized(); 7592 7593 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7594 return true; 7595 } 7596 7597 bool LoopVectorizePass::processLoop(Loop *L) { 7598 assert((EnableVPlanNativePath || L->empty()) && 7599 "VPlan-native path is not enabled. Only process inner loops."); 7600 7601 #ifndef NDEBUG 7602 const std::string DebugLocStr = getDebugLocString(L); 7603 #endif /* NDEBUG */ 7604 7605 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7606 << L->getHeader()->getParent()->getName() << "\" from " 7607 << DebugLocStr << "\n"); 7608 7609 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7610 7611 LLVM_DEBUG( 7612 dbgs() << "LV: Loop hints:" 7613 << " force=" 7614 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7615 ? "disabled" 7616 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7617 ? "enabled" 7618 : "?")) 7619 << " width=" << Hints.getWidth() 7620 << " unroll=" << Hints.getInterleave() << "\n"); 7621 7622 // Function containing loop 7623 Function *F = L->getHeader()->getParent(); 7624 7625 // Looking at the diagnostic output is the only way to determine if a loop 7626 // was vectorized (other than looking at the IR or machine code), so it 7627 // is important to generate an optimization remark for each loop. Most of 7628 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7629 // generated as OptimizationRemark and OptimizationRemarkMissed are 7630 // less verbose reporting vectorized loops and unvectorized loops that may 7631 // benefit from vectorization, respectively. 7632 7633 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7634 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7635 return false; 7636 } 7637 7638 PredicatedScalarEvolution PSE(*SE, *L); 7639 7640 // Check if it is legal to vectorize the loop. 7641 LoopVectorizationRequirements Requirements(*ORE); 7642 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7643 &Requirements, &Hints, DB, AC); 7644 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7645 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7646 Hints.emitRemarkWithHints(); 7647 return false; 7648 } 7649 7650 // Check the function attributes and profiles to find out if this function 7651 // should be optimized for size. 7652 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7653 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7654 7655 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7656 // here. They may require CFG and instruction level transformations before 7657 // even evaluating whether vectorization is profitable. Since we cannot modify 7658 // the incoming IR, we need to build VPlan upfront in the vectorization 7659 // pipeline. 7660 if (!L->empty()) 7661 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7662 ORE, BFI, PSI, Hints); 7663 7664 assert(L->empty() && "Inner loop expected."); 7665 7666 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7667 // count by optimizing for size, to minimize overheads. 7668 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7669 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7670 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 7671 << "This loop is worth vectorizing only if no scalar " 7672 << "iteration overheads are incurred."); 7673 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7674 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7675 else { 7676 LLVM_DEBUG(dbgs() << "\n"); 7677 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7678 } 7679 } 7680 7681 // Check the function attributes to see if implicit floats are allowed. 7682 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7683 // an integer loop and the vector instructions selected are purely integer 7684 // vector instructions? 7685 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7686 reportVectorizationFailure( 7687 "Can't vectorize when the NoImplicitFloat attribute is used", 7688 "loop not vectorized due to NoImplicitFloat attribute", 7689 "NoImplicitFloat", ORE, L); 7690 Hints.emitRemarkWithHints(); 7691 return false; 7692 } 7693 7694 // Check if the target supports potentially unsafe FP vectorization. 7695 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7696 // for the target we're vectorizing for, to make sure none of the 7697 // additional fp-math flags can help. 7698 if (Hints.isPotentiallyUnsafe() && 7699 TTI->isFPVectorizationPotentiallyUnsafe()) { 7700 reportVectorizationFailure( 7701 "Potentially unsafe FP op prevents vectorization", 7702 "loop not vectorized due to unsafe FP support.", 7703 "UnsafeFP", ORE, L); 7704 Hints.emitRemarkWithHints(); 7705 return false; 7706 } 7707 7708 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7709 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7710 7711 // If an override option has been passed in for interleaved accesses, use it. 7712 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7713 UseInterleaved = EnableInterleavedMemAccesses; 7714 7715 // Analyze interleaved memory accesses. 7716 if (UseInterleaved) { 7717 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7718 } 7719 7720 // Use the cost model. 7721 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7722 F, &Hints, IAI); 7723 CM.collectValuesToIgnore(); 7724 7725 // Use the planner for vectorization. 7726 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); 7727 7728 // Get user vectorization factor. 7729 unsigned UserVF = Hints.getWidth(); 7730 7731 // Plan how to best vectorize, return the best VF and its cost. 7732 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7733 7734 VectorizationFactor VF = VectorizationFactor::Disabled(); 7735 unsigned IC = 1; 7736 unsigned UserIC = Hints.getInterleave(); 7737 7738 if (MaybeVF) { 7739 VF = *MaybeVF; 7740 // Select the interleave count. 7741 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7742 } 7743 7744 // Identify the diagnostic messages that should be produced. 
7745 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 7746 bool VectorizeLoop = true, InterleaveLoop = true; 7747 if (Requirements.doesNotMeet(F, L, Hints)) { 7748 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 7749 "requirements.\n"); 7750 Hints.emitRemarkWithHints(); 7751 return false; 7752 } 7753 7754 if (VF.Width == 1) { 7755 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 7756 VecDiagMsg = std::make_pair( 7757 "VectorizationNotBeneficial", 7758 "the cost-model indicates that vectorization is not beneficial"); 7759 VectorizeLoop = false; 7760 } 7761 7762 if (!MaybeVF && UserIC > 1) { 7763 // Tell the user interleaving was avoided up-front, despite being explicitly 7764 // requested. 7765 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 7766 "interleaving should be avoided up front\n"); 7767 IntDiagMsg = std::make_pair( 7768 "InterleavingAvoided", 7769 "Ignoring UserIC, because interleaving was avoided up front"); 7770 InterleaveLoop = false; 7771 } else if (IC == 1 && UserIC <= 1) { 7772 // Tell the user interleaving is not beneficial. 7773 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 7774 IntDiagMsg = std::make_pair( 7775 "InterleavingNotBeneficial", 7776 "the cost-model indicates that interleaving is not beneficial"); 7777 InterleaveLoop = false; 7778 if (UserIC == 1) { 7779 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 7780 IntDiagMsg.second += 7781 " and is explicitly disabled or interleave count is set to 1"; 7782 } 7783 } else if (IC > 1 && UserIC == 1) { 7784 // Tell the user interleaving is beneficial, but it is explicitly disabled. 7785 LLVM_DEBUG( 7786 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.\n"); 7787 IntDiagMsg = std::make_pair( 7788 "InterleavingBeneficialButDisabled", 7789 "the cost-model indicates that interleaving is beneficial " 7790 "but is explicitly disabled or interleave count is set to 1"); 7791 InterleaveLoop = false; 7792 } 7793 7794 // Override IC if the user provided an interleave count. 7795 IC = UserIC > 0 ? UserIC : IC; 7796 7797 // Emit diagnostic messages, if any. 7798 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 7799 if (!VectorizeLoop && !InterleaveLoop) { 7800 // Do not vectorize or interleave the loop.
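    // Both remarks below are OptimizationRemarkMissed, so a user running with,
    // e.g., -Rpass-missed=loop-vectorize (the usual clang spelling, given here
    // for illustration) can see why neither transformation was applied.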
7801 ORE->emit([&]() { 7802 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7803 L->getStartLoc(), L->getHeader()) 7804 << VecDiagMsg.second; 7805 }); 7806 ORE->emit([&]() { 7807 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7808 L->getStartLoc(), L->getHeader()) 7809 << IntDiagMsg.second; 7810 }); 7811 return false; 7812 } else if (!VectorizeLoop && InterleaveLoop) { 7813 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7814 ORE->emit([&]() { 7815 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7816 L->getStartLoc(), L->getHeader()) 7817 << VecDiagMsg.second; 7818 }); 7819 } else if (VectorizeLoop && !InterleaveLoop) { 7820 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7821 << ") in " << DebugLocStr << '\n'); 7822 ORE->emit([&]() { 7823 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7824 L->getStartLoc(), L->getHeader()) 7825 << IntDiagMsg.second; 7826 }); 7827 } else if (VectorizeLoop && InterleaveLoop) { 7828 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7829 << ") in " << DebugLocStr << '\n'); 7830 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7831 } 7832 7833 LVP.setBestPlan(VF.Width, IC); 7834 7835 using namespace ore; 7836 bool DisableRuntimeUnroll = false; 7837 MDNode *OrigLoopID = L->getLoopID(); 7838 7839 if (!VectorizeLoop) { 7840 assert(IC > 1 && "interleave count should not be 1 or 0"); 7841 // If we decided that it is not legal to vectorize the loop, then 7842 // interleave it. 7843 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7844 &CM); 7845 LVP.executePlan(Unroller, DT); 7846 7847 ORE->emit([&]() { 7848 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7849 L->getHeader()) 7850 << "interleaved loop (interleaved count: " 7851 << NV("InterleaveCount", IC) << ")"; 7852 }); 7853 } else { 7854 // If we decided that it is *legal* to vectorize the loop, then do it. 7855 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7856 &LVL, &CM); 7857 LVP.executePlan(LB, DT); 7858 ++LoopsVectorized; 7859 7860 // Add metadata to disable runtime unrolling a scalar loop when there are 7861 // no runtime checks about strides and memory. A scalar loop that is 7862 // rarely used is not worth unrolling. 7863 if (!LB.areSafetyChecksAdded()) 7864 DisableRuntimeUnroll = true; 7865 7866 // Report the vectorization decision. 7867 ORE->emit([&]() { 7868 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7869 L->getHeader()) 7870 << "vectorized loop (vectorization width: " 7871 << NV("VectorizationFactor", VF.Width) 7872 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7873 }); 7874 } 7875 7876 Optional<MDNode *> RemainderLoopID = 7877 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7878 LLVMLoopVectorizeFollowupEpilogue}); 7879 if (RemainderLoopID.hasValue()) { 7880 L->setLoopID(RemainderLoopID.getValue()); 7881 } else { 7882 if (DisableRuntimeUnroll) 7883 AddRuntimeUnrollDisableMetaData(L); 7884 7885 // Mark the loop as already vectorized to avoid vectorizing again. 
7886 Hints.setAlreadyVectorized(); 7887 } 7888 7889 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7890 return true; 7891 } 7892 7893 bool LoopVectorizePass::runImpl( 7894 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 7895 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 7896 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, 7897 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 7898 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 7899 SE = &SE_; 7900 LI = &LI_; 7901 TTI = &TTI_; 7902 DT = &DT_; 7903 BFI = &BFI_; 7904 TLI = TLI_; 7905 AA = &AA_; 7906 AC = &AC_; 7907 GetLAA = &GetLAA_; 7908 DB = &DB_; 7909 ORE = &ORE_; 7910 PSI = PSI_; 7911 7912 // Don't attempt if 7913 // 1. the target claims to have no vector registers, and 7914 // 2. interleaving won't help ILP. 7915 // 7916 // The second condition is necessary because, even if the target has no 7917 // vector registers, loop vectorization may still enable scalar 7918 // interleaving. 7919 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 7920 TTI->getMaxInterleaveFactor(1) < 2) 7921 return false; 7922 7923 bool Changed = false; 7924 7925 // The vectorizer requires loops to be in simplified form. 7926 // Since simplification may add new inner loops, it has to run before the 7927 // legality and profitability checks. This means running the loop vectorizer 7928 // will simplify all loops, regardless of whether anything end up being 7929 // vectorized. 7930 for (auto &L : *LI) 7931 Changed |= 7932 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 7933 7934 // Build up a worklist of inner-loops to vectorize. This is necessary as 7935 // the act of vectorizing or partially unrolling a loop creates new loops 7936 // and can invalidate iterators across the loops. 7937 SmallVector<Loop *, 8> Worklist; 7938 7939 for (Loop *L : *LI) 7940 collectSupportedLoops(*L, LI, ORE, Worklist); 7941 7942 LoopsAnalyzed += Worklist.size(); 7943 7944 // Now walk the identified inner loops. 7945 while (!Worklist.empty()) { 7946 Loop *L = Worklist.pop_back_val(); 7947 7948 // For the inner loops we actually process, form LCSSA to simplify the 7949 // transform. 7950 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 7951 7952 Changed |= processLoop(L); 7953 } 7954 7955 // Process each loop nest in the function. 7956 return Changed; 7957 } 7958 7959 PreservedAnalyses LoopVectorizePass::run(Function &F, 7960 FunctionAnalysisManager &AM) { 7961 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 7962 auto &LI = AM.getResult<LoopAnalysis>(F); 7963 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 7964 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 7965 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 7966 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 7967 auto &AA = AM.getResult<AAManager>(F); 7968 auto &AC = AM.getResult<AssumptionAnalysis>(F); 7969 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 7970 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 7971 MemorySSA *MSSA = EnableMSSALoopDependency 7972 ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 7973 : nullptr; 7974 7975 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 7976 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 7977 [&](Loop &L) -> const LoopAccessInfo & { 7978 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 7979 return LAM.getResult<LoopAccessAnalysis>(L, AR); 7980 }; 7981 const ModuleAnalysisManager &MAM = 7982 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 7983 ProfileSummaryInfo *PSI = 7984 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 7985 bool Changed = 7986 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 7987 if (!Changed) 7988 return PreservedAnalyses::all(); 7989 PreservedAnalyses PA; 7990 7991 // We currently do not preserve loopinfo/dominator analyses with outer loop 7992 // vectorization. Until this is addressed, mark these analyses as preserved 7993 // only for non-VPlan-native path. 7994 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 7995 if (!EnableVPlanNativePath) { 7996 PA.preserve<LoopAnalysis>(); 7997 PA.preserve<DominatorTreeAnalysis>(); 7998 } 7999 PA.preserve<BasicAA>(); 8000 PA.preserve<GlobalsAA>(); 8001 return PA; 8002 } 8003
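// A typical way to exercise this pass in isolation under the new pass manager
// (an illustrative command line, not part of this file's interface):
//   opt -passes=loop-vectorize -S input.ll -o vectorized.ll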