//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
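//
// As an informal, illustrative sketch (not taken from the papers above) of
// the 'wide' iteration described earlier: a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is conceptually rewritten, for a vectorization factor of 4, so that each
// iteration processes a[i..i+3] and b[i..i+3] with vector instructions and
// the induction variable is incremented by 4; the remaining iterations are
// then handled by a scalar epilogue loop (or by predication, see below).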
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
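///
/// As a rough, simplified sketch of how this class is meant to be driven
/// (not a verbatim copy of the actual call sites, which live in the planner
/// and in the pass further down in this file):
///
///   InnerLoopVectorizer ILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF, UF,
///                           &LVL, &CM);
///   ILV.createVectorizedLoopSkeleton(); // runtime checks + new loop blocks
///   // ... widen the loop body, e.g. via widenInstruction(),
///   // widenPHIInstruction() and widenGEP(), as directed by the plan ...
///   ILV.fixVectorizedLoop();            // fix phis, live-outs, metadata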
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop into the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    }
    else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
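///
/// As a purely hypothetical example of the comparison being made (the numbers
/// are invented for illustration only): if one scalar iteration is estimated
/// to cost 8 units and one VF=4 vector iteration is estimated to cost 20
/// units, the per-lane cost at VF=4 is 20 / 4 = 5 units, which beats the
/// scalar cost of 8, so VF=4 would be considered profitable for that loop.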
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
1412 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1413 1414 /// Returns the expected difference in cost from scalarizing the expression 1415 /// feeding a predicated instruction \p PredInst. The instructions to 1416 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1417 /// non-negative return value implies the expression will be scalarized. 1418 /// Currently, only single-use chains are considered for scalarization. 1419 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1420 unsigned VF); 1421 1422 /// Collect the instructions that are uniform after vectorization. An 1423 /// instruction is uniform if we represent it with a single scalar value in 1424 /// the vectorized loop corresponding to each vector iteration. Examples of 1425 /// uniform instructions include pointer operands of consecutive or 1426 /// interleaved memory accesses. Note that although uniformity implies an 1427 /// instruction will be scalar, the reverse is not true. In general, a 1428 /// scalarized instruction will be represented by VF scalar values in the 1429 /// vectorized loop, each corresponding to an iteration of the original 1430 /// scalar loop. 1431 void collectLoopUniforms(unsigned VF); 1432 1433 /// Collect the instructions that are scalar after vectorization. An 1434 /// instruction is scalar if it is known to be uniform or will be scalarized 1435 /// during vectorization. Non-uniform scalarized instructions will be 1436 /// represented by VF values in the vectorized loop, each corresponding to an 1437 /// iteration of the original scalar loop. 1438 void collectLoopScalars(unsigned VF); 1439 1440 /// Keeps cost model vectorization decision and cost for instructions. 1441 /// Right now it is used for memory instructions only. 1442 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1443 std::pair<InstWidening, unsigned>>; 1444 1445 DecisionList WideningDecisions; 1446 1447 /// Returns true if \p V is expected to be vectorized and it needs to be 1448 /// extracted. 1449 bool needsExtract(Value *V, unsigned VF) const { 1450 Instruction *I = dyn_cast<Instruction>(V); 1451 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1452 return false; 1453 1454 // Assume we can vectorize V (and hence we need extraction) if the 1455 // scalars are not computed yet. This can happen, because it is called 1456 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1457 // the scalars are collected. That should be a safe assumption in most 1458 // cases, because we check if the operands have vectorizable types 1459 // beforehand in LoopVectorizationLegality. 1460 return Scalars.find(VF) == Scalars.end() || 1461 !isScalarAfterVectorization(I, VF); 1462 }; 1463 1464 /// Returns a range containing only operands needing to be extracted. 1465 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1466 unsigned VF) { 1467 return SmallVector<Value *, 4>(make_filter_range( 1468 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1469 } 1470 1471 public: 1472 /// The loop that we evaluate. 1473 Loop *TheLoop; 1474 1475 /// Predicated scalar evolution analysis. 1476 PredicatedScalarEvolution &PSE; 1477 1478 /// Loop Info analysis. 1479 LoopInfo *LI; 1480 1481 /// Vectorization legality. 1482 LoopVectorizationLegality *Legal; 1483 1484 /// Vector target information. 1485 const TargetTransformInfo &TTI; 1486 1487 /// Target Library Info. 
1488 const TargetLibraryInfo *TLI; 1489 1490 /// Demanded bits analysis. 1491 DemandedBits *DB; 1492 1493 /// Assumption cache. 1494 AssumptionCache *AC; 1495 1496 /// Interface to emit optimization remarks. 1497 OptimizationRemarkEmitter *ORE; 1498 1499 const Function *TheFunction; 1500 1501 /// Loop Vectorize Hint. 1502 const LoopVectorizeHints *Hints; 1503 1504 /// The interleave access information contains groups of interleaved accesses 1505 /// with the same stride and close to each other. 1506 InterleavedAccessInfo &InterleaveInfo; 1507 1508 /// Values to ignore in the cost model. 1509 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1510 1511 /// Values to ignore in the cost model when VF > 1. 1512 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1513 }; 1514 1515 } // end namespace llvm 1516 1517 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1518 // vectorization. The loop needs to be annotated with #pragma omp simd 1519 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1520 // vector length information is not provided, vectorization is not considered 1521 // explicit. Interleave hints are not allowed either. These limitations will be 1522 // relaxed in the future. 1523 // Please, note that we are currently forced to abuse the pragma 'clang 1524 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1525 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1526 // provides *explicit vectorization hints* (LV can bypass legal checks and 1527 // assume that vectorization is legal). However, both hints are implemented 1528 // using the same metadata (llvm.loop.vectorize, processed by 1529 // LoopVectorizeHints). This will be fixed in the future when the native IR 1530 // representation for pragma 'omp simd' is introduced. 1531 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1532 OptimizationRemarkEmitter *ORE) { 1533 assert(!OuterLp->empty() && "This is not an outer loop"); 1534 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1535 1536 // Only outer loops with an explicit vectorization hint are supported. 1537 // Unannotated outer loops are ignored. 1538 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1539 return false; 1540 1541 Function *Fn = OuterLp->getHeader()->getParent(); 1542 if (!Hints.allowVectorization(Fn, OuterLp, 1543 true /*VectorizeOnlyWhenForced*/)) { 1544 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1545 return false; 1546 } 1547 1548 if (Hints.getInterleave() > 1) { 1549 // TODO: Interleave support is future work. 1550 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1551 "outer loops.\n"); 1552 Hints.emitRemarkWithHints(); 1553 return false; 1554 } 1555 1556 return true; 1557 } 1558 1559 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1560 OptimizationRemarkEmitter *ORE, 1561 SmallVectorImpl<Loop *> &V) { 1562 // Collect inner loops and outer loops without irreducible control flow. For 1563 // now, only collect outer loops that have explicit vectorization hints. If we 1564 // are stress testing the VPlan H-CFG construction, we collect the outermost 1565 // loop of every loop nest. 
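// For example (purely illustrative, not taken from a test case): in a
// two-deep nest where the outer loop carries no explicit hint, only the
// innermost loop is collected below; if the outer loop is annotated in the
// '#pragma omp simd simdlen(N)' style described above and the VPlan-native
// path is enabled, the outer loop is collected instead and its inner loops
// are not visited.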
1566 if (L.empty() || VPlanBuildStressTest || 1567 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1568 LoopBlocksRPO RPOT(&L); 1569 RPOT.perform(LI); 1570 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1571 V.push_back(&L); 1572 // TODO: Collect inner loops inside marked outer loops in case 1573 // vectorization fails for the outer loop. Do not invoke 1574 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1575 // already known to be reducible. We can use an inherited attribute for 1576 // that. 1577 return; 1578 } 1579 } 1580 for (Loop *InnerL : L) 1581 collectSupportedLoops(*InnerL, LI, ORE, V); 1582 } 1583 1584 namespace { 1585 1586 /// The LoopVectorize Pass. 1587 struct LoopVectorize : public FunctionPass { 1588 /// Pass identification, replacement for typeid 1589 static char ID; 1590 1591 LoopVectorizePass Impl; 1592 1593 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1594 bool VectorizeOnlyWhenForced = false) 1595 : FunctionPass(ID) { 1596 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1597 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1598 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1599 } 1600 1601 bool runOnFunction(Function &F) override { 1602 if (skipFunction(F)) 1603 return false; 1604 1605 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1606 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1607 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1608 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1609 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1610 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1611 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1612 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1613 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1614 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1615 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1616 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1617 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1618 1619 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1620 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1621 1622 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1623 GetLAA, *ORE, PSI); 1624 } 1625 1626 void getAnalysisUsage(AnalysisUsage &AU) const override { 1627 AU.addRequired<AssumptionCacheTracker>(); 1628 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1629 AU.addRequired<DominatorTreeWrapperPass>(); 1630 AU.addRequired<LoopInfoWrapperPass>(); 1631 AU.addRequired<ScalarEvolutionWrapperPass>(); 1632 AU.addRequired<TargetTransformInfoWrapperPass>(); 1633 AU.addRequired<AAResultsWrapperPass>(); 1634 AU.addRequired<LoopAccessLegacyAnalysis>(); 1635 AU.addRequired<DemandedBitsWrapperPass>(); 1636 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1637 1638 // We currently do not preserve loopinfo/dominator analyses with outer loop 1639 // vectorization. Until this is addressed, mark these analyses as preserved 1640 // only for non-VPlan-native path. 1641 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1642 if (!EnableVPlanNativePath) { 1643 AU.addPreserved<LoopInfoWrapperPass>(); 1644 AU.addPreserved<DominatorTreeWrapperPass>(); 1645 } 1646 1647 AU.addPreserved<BasicAAWrapperPass>(); 1648 AU.addPreserved<GlobalsAAWrapperPass>(); 1649 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1650 } 1651 }; 1652 1653 } // end anonymous namespace 1654 1655 //===----------------------------------------------------------------------===// 1656 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1657 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1658 //===----------------------------------------------------------------------===// 1659 1660 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1661 // We need to place the broadcast of invariant variables outside the loop, 1662 // but only if it's proven safe to do so. Else, broadcast will be inside 1663 // vector loop body. 1664 Instruction *Instr = dyn_cast<Instruction>(V); 1665 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1666 (!Instr || 1667 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1668 // Place the code for broadcasting invariant variables in the new preheader. 1669 IRBuilder<>::InsertPointGuard Guard(Builder); 1670 if (SafeToHoist) 1671 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1672 1673 // Broadcast the scalar into all locations in the vector. 1674 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1675 1676 return Shuf; 1677 } 1678 1679 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1680 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1681 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1682 "Expected either an induction phi-node or a truncate of it!"); 1683 Value *Start = II.getStartValue(); 1684 1685 // Construct the initial value of the vector IV in the vector loop preheader 1686 auto CurrIP = Builder.saveIP(); 1687 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1688 if (isa<TruncInst>(EntryVal)) { 1689 assert(Start->getType()->isIntegerTy() && 1690 "Truncation requires an integer type"); 1691 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1692 Step = Builder.CreateTrunc(Step, TruncType); 1693 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1694 } 1695 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1696 Value *SteppedStart = 1697 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1698 1699 // We create vector phi nodes for both integer and floating-point induction 1700 // variables. Here, we determine the kind of arithmetic we will perform. 1701 Instruction::BinaryOps AddOp; 1702 Instruction::BinaryOps MulOp; 1703 if (Step->getType()->isIntegerTy()) { 1704 AddOp = Instruction::Add; 1705 MulOp = Instruction::Mul; 1706 } else { 1707 AddOp = II.getInductionOpcode(); 1708 MulOp = Instruction::FMul; 1709 } 1710 1711 // Multiply the vectorization factor by the step using integer or 1712 // floating-point arithmetic as appropriate. 1713 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1714 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1715 1716 // Create a vector splat to use in the induction update. 1717 // 1718 // FIXME: If the step is non-constant, we create the vector splat with 1719 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1720 // handle a constant vector splat. 1721 Value *SplatVF = isa<Constant>(Mul) 1722 ? 
ConstantVector::getSplat(VF, cast<Constant>(Mul)) 1723 : Builder.CreateVectorSplat(VF, Mul); 1724 Builder.restoreIP(CurrIP); 1725 1726 // We may need to add the step a number of times, depending on the unroll 1727 // factor. The last of those goes into the PHI. 1728 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1729 &*LoopVectorBody->getFirstInsertionPt()); 1730 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1731 Instruction *LastInduction = VecInd; 1732 for (unsigned Part = 0; Part < UF; ++Part) { 1733 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1734 1735 if (isa<TruncInst>(EntryVal)) 1736 addMetadata(LastInduction, EntryVal); 1737 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1738 1739 LastInduction = cast<Instruction>(addFastMathFlag( 1740 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1741 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1742 } 1743 1744 // Move the last step to the end of the latch block. This ensures consistent 1745 // placement of all induction updates. 1746 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1747 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1748 auto *ICmp = cast<Instruction>(Br->getCondition()); 1749 LastInduction->moveBefore(ICmp); 1750 LastInduction->setName("vec.ind.next"); 1751 1752 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1753 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1754 } 1755 1756 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1757 return Cost->isScalarAfterVectorization(I, VF) || 1758 Cost->isProfitableToScalarize(I, VF); 1759 } 1760 1761 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1762 if (shouldScalarizeInstruction(IV)) 1763 return true; 1764 auto isScalarInst = [&](User *U) -> bool { 1765 auto *I = cast<Instruction>(U); 1766 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1767 }; 1768 return llvm::any_of(IV->users(), isScalarInst); 1769 } 1770 1771 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1772 const InductionDescriptor &ID, const Instruction *EntryVal, 1773 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1774 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1775 "Expected either an induction phi-node or a truncate of it!"); 1776 1777 // This induction variable is not the phi from the original loop but the 1778 // newly-created IV based on the proof that casted Phi is equal to the 1779 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1780 // re-uses the same InductionDescriptor that original IV uses but we don't 1781 // have to do any recording in this case - that is done when original IV is 1782 // processed. 1783 if (isa<TruncInst>(EntryVal)) 1784 return; 1785 1786 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1787 if (Casts.empty()) 1788 return; 1789 // Only the first Cast instruction in the Casts vector is of interest. 1790 // The rest of the Casts (if exist) have no uses outside the 1791 // induction update chain itself. 
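  // Illustrative sketch only (hypothetical IR, not taken from the source):
  // for an induction such as
  //   %iv     = phi i32 [ 0, %ph ], [ %iv.next, %latch ]
  //   %iv.ext = sext i32 %iv to i64   ; proven equal to a wide IV under a
  //                                   ; runtime guard
  // only %iv.ext needs to be mapped to the newly created vector/scalar
  // value; any further casts feed the induction update chain alone.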
1792 Instruction *CastInst = *Casts.begin(); 1793 if (Lane < UINT_MAX) 1794 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1795 else 1796 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1797 } 1798 1799 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1800 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1801 "Primary induction variable must have an integer type"); 1802 1803 auto II = Legal->getInductionVars()->find(IV); 1804 assert(II != Legal->getInductionVars()->end() && "IV is not an induction"); 1805 1806 auto ID = II->second; 1807 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1808 1809 // The scalar value to broadcast. This will be derived from the canonical 1810 // induction variable. 1811 Value *ScalarIV = nullptr; 1812 1813 // The value from the original loop to which we are mapping the new induction 1814 // variable. 1815 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1816 1817 // True if we have vectorized the induction variable. 1818 auto VectorizedIV = false; 1819 1820 // Determine if we want a scalar version of the induction variable. This is 1821 // true if the induction variable itself is not widened, or if it has at 1822 // least one user in the loop that is not widened. 1823 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal); 1824 1825 // Generate code for the induction step. Note that induction steps are 1826 // required to be loop-invariant 1827 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && 1828 "Induction step should be loop invariant"); 1829 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1830 Value *Step = nullptr; 1831 if (PSE.getSE()->isSCEVable(IV->getType())) { 1832 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1833 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(), 1834 LoopVectorPreHeader->getTerminator()); 1835 } else { 1836 Step = cast<SCEVUnknown>(ID.getStep())->getValue(); 1837 } 1838 1839 // Try to create a new independent vector induction variable. If we can't 1840 // create the phi node, we will splat the scalar induction variable in each 1841 // loop iteration. 1842 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) { 1843 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1844 VectorizedIV = true; 1845 } 1846 1847 // If we haven't yet vectorized the induction variable, or if we will create 1848 // a scalar one, we need to define the scalar induction variable and step 1849 // values. If we were given a truncation type, truncate the canonical 1850 // induction variable and step. Otherwise, derive these values from the 1851 // induction descriptor. 1852 if (!VectorizedIV || NeedsScalarIV) { 1853 ScalarIV = Induction; 1854 if (IV != OldInduction) { 1855 ScalarIV = IV->getType()->isIntegerTy() 1856 ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1857 : Builder.CreateCast(Instruction::SIToFP, Induction, 1858 IV->getType()); 1859 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1860 ScalarIV->setName("offset.idx"); 1861 } 1862 if (Trunc) { 1863 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1864 assert(Step->getType()->isIntegerTy() && 1865 "Truncation requires an integer step"); 1866 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1867 Step = Builder.CreateTrunc(Step, TruncType); 1868 } 1869 } 1870 1871 // If we haven't yet vectorized the induction variable, splat the scalar 1872 // induction variable, and build the necessary step vectors. 1873 // TODO: Don't do it unless the vectorized IV is really required. 1874 if (!VectorizedIV) { 1875 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1876 for (unsigned Part = 0; Part < UF; ++Part) { 1877 Value *EntryPart = 1878 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1879 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1880 if (Trunc) 1881 addMetadata(EntryPart, Trunc); 1882 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1883 } 1884 } 1885 1886 // If an induction variable is only used for counting loop iterations or 1887 // calculating addresses, it doesn't need to be widened. Create scalar steps 1888 // that can be used by instructions we will later scalarize. Note that the 1889 // addition of the scalar steps will not increase the number of instructions 1890 // in the loop in the common case prior to InstCombine. We will be trading 1891 // one vector extract for each scalar step. 1892 if (NeedsScalarIV) 1893 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1894 } 1895 1896 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1897 Instruction::BinaryOps BinOp) { 1898 // Create and check the types. 1899 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1900 int VLen = Val->getType()->getVectorNumElements(); 1901 1902 Type *STy = Val->getType()->getScalarType(); 1903 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1904 "Induction Step must be an integer or FP"); 1905 assert(Step->getType() == STy && "Step has wrong type"); 1906 1907 SmallVector<Constant *, 8> Indices; 1908 1909 if (STy->isIntegerTy()) { 1910 // Create a vector of consecutive numbers from zero to VF. 1911 for (int i = 0; i < VLen; ++i) 1912 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1913 1914 // Add the consecutive indices to the vector value. 1915 Constant *Cv = ConstantVector::get(Indices); 1916 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1917 Step = Builder.CreateVectorSplat(VLen, Step); 1918 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1919 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1920 // which can be found from the original scalar operations. 1921 Step = Builder.CreateMul(Cv, Step); 1922 return Builder.CreateAdd(Val, Step, "induction"); 1923 } 1924 1925 // Floating point induction. 1926 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1927 "Binary Opcode should be specified for FP induction"); 1928 // Create a vector of consecutive numbers from zero to VF. 1929 for (int i = 0; i < VLen; ++i) 1930 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1931 1932 // Add the consecutive indices to the vector value. 
1933 Constant *Cv = ConstantVector::get(Indices); 1934 1935 Step = Builder.CreateVectorSplat(VLen, Step); 1936 1937 // Floating point operations had to be 'fast' to enable the induction. 1938 FastMathFlags Flags; 1939 Flags.setFast(); 1940 1941 Value *MulOp = Builder.CreateFMul(Cv, Step); 1942 if (isa<Instruction>(MulOp)) 1943 // Have to check, MulOp may be a constant 1944 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1945 1946 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1947 if (isa<Instruction>(BOp)) 1948 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1949 return BOp; 1950 } 1951 1952 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1953 Instruction *EntryVal, 1954 const InductionDescriptor &ID) { 1955 // We shouldn't have to build scalar steps if we aren't vectorizing. 1956 assert(VF > 1 && "VF should be greater than one"); 1957 1958 // Get the value type and ensure it and the step have the same integer type. 1959 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1960 assert(ScalarIVTy == Step->getType() && 1961 "Val and Step should have the same type"); 1962 1963 // We build scalar steps for both integer and floating-point induction 1964 // variables. Here, we determine the kind of arithmetic we will perform. 1965 Instruction::BinaryOps AddOp; 1966 Instruction::BinaryOps MulOp; 1967 if (ScalarIVTy->isIntegerTy()) { 1968 AddOp = Instruction::Add; 1969 MulOp = Instruction::Mul; 1970 } else { 1971 AddOp = ID.getInductionOpcode(); 1972 MulOp = Instruction::FMul; 1973 } 1974 1975 // Determine the number of scalars we need to generate for each unroll 1976 // iteration. If EntryVal is uniform, we only need to generate the first 1977 // lane. Otherwise, we generate all VF values. 1978 unsigned Lanes = 1979 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1980 : VF; 1981 // Compute the scalar steps and save the results in VectorLoopValueMap. 1982 for (unsigned Part = 0; Part < UF; ++Part) { 1983 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1984 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1985 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1986 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1987 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1988 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1989 } 1990 } 1991 } 1992 1993 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 1994 assert(V != Induction && "The new induction variable should not be used."); 1995 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 1996 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 1997 1998 // If we have a stride that is replaced by one, do it here. Defer this for 1999 // the VPlan-native path until we start running Legal checks in that path. 2000 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2001 V = ConstantInt::get(V->getType(), 1); 2002 2003 // If we have a vector mapped to this value, return it. 2004 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2005 return VectorLoopValueMap.getVectorValue(V, Part); 2006 2007 // If the value has not been vectorized, check if it has been scalarized 2008 // instead. If it has been scalarized, and we actually need the value in 2009 // vector form, we will construct the vector values on demand. 
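  // Illustrative example (hypothetical IR, assuming VF = 4 and an i32 value
  // scalarized into %s0..%s3): the on-demand packing below would emit
  //   %p0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  //   %p1 = insertelement <4 x i32> %p0,   i32 %s1, i32 1
  //   ...and so on up to lane 3,
  // unless the value is uniform after vectorization, in which case lane zero
  // is simply broadcast.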
2010 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2011 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2012 2013 // If we've scalarized a value, that value should be an instruction. 2014 auto *I = cast<Instruction>(V); 2015 2016 // If we aren't vectorizing, we can just copy the scalar map values over to 2017 // the vector map. 2018 if (VF == 1) { 2019 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2020 return ScalarValue; 2021 } 2022 2023 // Get the last scalar instruction we generated for V and Part. If the value 2024 // is known to be uniform after vectorization, this corresponds to lane zero 2025 // of the Part unroll iteration. Otherwise, the last instruction is the one 2026 // we created for the last vector lane of the Part unroll iteration. 2027 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2028 auto *LastInst = cast<Instruction>( 2029 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2030 2031 // Set the insert point after the last scalarized instruction. This ensures 2032 // the insertelement sequence will directly follow the scalar definitions. 2033 auto OldIP = Builder.saveIP(); 2034 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2035 Builder.SetInsertPoint(&*NewIP); 2036 2037 // However, if we are vectorizing, we need to construct the vector values. 2038 // If the value is known to be uniform after vectorization, we can just 2039 // broadcast the scalar value corresponding to lane zero for each unroll 2040 // iteration. Otherwise, we construct the vector values using insertelement 2041 // instructions. Since the resulting vectors are stored in 2042 // VectorLoopValueMap, we will only generate the insertelements once. 2043 Value *VectorValue = nullptr; 2044 if (Cost->isUniformAfterVectorization(I, VF)) { 2045 VectorValue = getBroadcastInstrs(ScalarValue); 2046 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2047 } else { 2048 // Initialize packing with insertelements to start from undef. 2049 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2050 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2051 for (unsigned Lane = 0; Lane < VF; ++Lane) 2052 packScalarIntoVectorValue(V, {Part, Lane}); 2053 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2054 } 2055 Builder.restoreIP(OldIP); 2056 return VectorValue; 2057 } 2058 2059 // If this scalar is unknown, assume that it is a constant or that it is 2060 // loop invariant. Broadcast V and save the value for future uses. 2061 Value *B = getBroadcastInstrs(V); 2062 VectorLoopValueMap.setVectorValue(V, Part, B); 2063 return B; 2064 } 2065 2066 Value * 2067 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2068 const VPIteration &Instance) { 2069 // If the value is not an instruction contained in the loop, it should 2070 // already be scalar. 2071 if (OrigLoop->isLoopInvariant(V)) 2072 return V; 2073 2074 assert(Instance.Lane > 0 2075 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2076 : true && "Uniform values only have lane zero"); 2077 2078 // If the value from the original loop has not been vectorized, it is 2079 // represented by UF x VF scalar values in the new loop. Return the requested 2080 // scalar value. 2081 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2082 return VectorLoopValueMap.getScalarValue(V, Instance); 2083 2084 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2085 // for the given unroll part. 
If this entry is not a vector type (i.e., the 2086 // vectorization factor is one), there is no need to generate an 2087 // extractelement instruction. 2088 auto *U = getOrCreateVectorValue(V, Instance.Part); 2089 if (!U->getType()->isVectorTy()) { 2090 assert(VF == 1 && "Value not scalarized has non-vector type"); 2091 return U; 2092 } 2093 2094 // Otherwise, the value from the original loop has been vectorized and is 2095 // represented by UF vector values. Extract and return the requested scalar 2096 // value from the appropriate vector lane. 2097 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2098 } 2099 2100 void InnerLoopVectorizer::packScalarIntoVectorValue( 2101 Value *V, const VPIteration &Instance) { 2102 assert(V != Induction && "The new induction variable should not be used."); 2103 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2104 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2105 2106 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2107 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2108 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2109 Builder.getInt32(Instance.Lane)); 2110 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2111 } 2112 2113 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2114 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2115 SmallVector<Constant *, 8> ShuffleMask; 2116 for (unsigned i = 0; i < VF; ++i) 2117 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2118 2119 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2120 ConstantVector::get(ShuffleMask), 2121 "reverse"); 2122 } 2123 2124 // Return whether we allow using masked interleave-groups (for dealing with 2125 // strided loads/stores that reside in predicated blocks, or for dealing 2126 // with gaps). 2127 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2128 // If an override option has been passed in for interleaved accesses, use it. 2129 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2130 return EnableMaskedInterleavedMemAccesses; 2131 2132 return TTI.enableMaskedInterleavedAccessVectorization(); 2133 } 2134 2135 // Try to vectorize the interleave group that \p Instr belongs to. 2136 // 2137 // E.g. Translate following interleaved load group (factor = 3): 2138 // for (i = 0; i < N; i+=3) { 2139 // R = Pic[i]; // Member of index 0 2140 // G = Pic[i+1]; // Member of index 1 2141 // B = Pic[i+2]; // Member of index 2 2142 // ... // do something to R, G, B 2143 // } 2144 // To: 2145 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2146 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2147 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2148 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2149 // 2150 // Or translate following interleaved store group (factor = 3): 2151 // for (i = 0; i < N; i+=3) { 2152 // ... 
do something to R, G, B
//        Pic[i]   = R;           // Member of index 0
//        Pic[i+1] = G;           // Member of index 1
//        Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                   VectorParts *BlockInMask) {
  const InterleaveGroup<Instruction> *Group =
      Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if current instruction is not the insert position.
  if (Instr != Group->getInsertPos())
    return;

  const DataLayout &DL = Instr->getModule()->getDataLayout();
  Value *Ptr = getLoadStorePointerOperand(Instr);

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));

  // Prepare for the new pointers.
  setDebugLocFromInst(Builder, Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);

  VectorParts Mask;
  bool IsMaskForCondRequired = BlockInMask;
  if (IsMaskForCondRequired) {
    Mask = *BlockInMask;
    // TODO: extend the masked interleaved-group support to reversed access.
    assert(!Group->isReverse() && "Reversed masked interleave-group "
                                  "not supported.");
  }

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
    InBounds = gep->isInBounds();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

    // Note that the current instruction could be a member at any index.
    // We need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
    if (InBounds)
      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);

    // Cast to the vector pointer type.
2228 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); 2229 } 2230 2231 setDebugLocFromInst(Builder, Instr); 2232 Value *UndefVec = UndefValue::get(VecTy); 2233 2234 Value *MaskForGaps = nullptr; 2235 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2236 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2237 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2238 } 2239 2240 // Vectorize the interleaved load group. 2241 if (isa<LoadInst>(Instr)) { 2242 // For each unroll part, create a wide load for the group. 2243 SmallVector<Value *, 2> NewLoads; 2244 for (unsigned Part = 0; Part < UF; Part++) { 2245 Instruction *NewLoad; 2246 if (IsMaskForCondRequired || MaskForGaps) { 2247 assert(useMaskedInterleavedAccesses(*TTI) && 2248 "masked interleaved groups are not allowed."); 2249 Value *GroupMask = MaskForGaps; 2250 if (IsMaskForCondRequired) { 2251 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2252 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2253 Value *ShuffledMask = Builder.CreateShuffleVector( 2254 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2255 GroupMask = MaskForGaps 2256 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2257 MaskForGaps) 2258 : ShuffledMask; 2259 } 2260 NewLoad = 2261 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), 2262 GroupMask, UndefVec, "wide.masked.vec"); 2263 } 2264 else 2265 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], 2266 Group->getAlignment(), "wide.vec"); 2267 Group->addMetadata(NewLoad); 2268 NewLoads.push_back(NewLoad); 2269 } 2270 2271 // For each member in the group, shuffle out the appropriate data from the 2272 // wide loads. 2273 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2274 Instruction *Member = Group->getMember(I); 2275 2276 // Skip the gaps in the group. 2277 if (!Member) 2278 continue; 2279 2280 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2281 for (unsigned Part = 0; Part < UF; Part++) { 2282 Value *StridedVec = Builder.CreateShuffleVector( 2283 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2284 2285 // If this member has different type, cast the result type. 2286 if (Member->getType() != ScalarTy) { 2287 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2288 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2289 } 2290 2291 if (Group->isReverse()) 2292 StridedVec = reverseVector(StridedVec); 2293 2294 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2295 } 2296 } 2297 return; 2298 } 2299 2300 // The sub vector type for current instruction. 2301 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2302 2303 // Vectorize the interleaved store group. 2304 for (unsigned Part = 0; Part < UF; Part++) { 2305 // Collect the stored vector from each member. 2306 SmallVector<Value *, 4> StoredVecs; 2307 for (unsigned i = 0; i < InterleaveFactor; i++) { 2308 // Interleaved store group doesn't allow a gap, so each index has a member 2309 Instruction *Member = Group->getMember(i); 2310 assert(Member && "Fail to get a member from an interleaved store group"); 2311 2312 Value *StoredVec = getOrCreateVectorValue( 2313 cast<StoreInst>(Member)->getValueOperand(), Part); 2314 if (Group->isReverse()) 2315 StoredVec = reverseVector(StoredVec); 2316 2317 // If this member has different type, cast it to a unified type. 
2318 2319 if (StoredVec->getType() != SubVT) 2320 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2321 2322 StoredVecs.push_back(StoredVec); 2323 } 2324 2325 // Concatenate all vectors into a wide vector. 2326 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2327 2328 // Interleave the elements in the wide vector. 2329 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2330 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2331 "interleaved.vec"); 2332 2333 Instruction *NewStoreInstr; 2334 if (IsMaskForCondRequired) { 2335 auto *Undefs = UndefValue::get(Mask[Part]->getType()); 2336 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2337 Value *ShuffledMask = Builder.CreateShuffleVector( 2338 Mask[Part], Undefs, RepMask, "interleaved.mask"); 2339 NewStoreInstr = Builder.CreateMaskedStore( 2340 IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); 2341 } 2342 else 2343 NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 2344 Group->getAlignment()); 2345 2346 Group->addMetadata(NewStoreInstr); 2347 } 2348 } 2349 2350 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2351 VectorParts *BlockInMask) { 2352 // Attempt to issue a wide load. 2353 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2354 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2355 2356 assert((LI || SI) && "Invalid Load/Store instruction"); 2357 2358 LoopVectorizationCostModel::InstWidening Decision = 2359 Cost->getWideningDecision(Instr, VF); 2360 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2361 "CM decision should be taken at this point"); 2362 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2363 return vectorizeInterleaveGroup(Instr); 2364 2365 Type *ScalarDataTy = getMemInstValueType(Instr); 2366 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2367 Value *Ptr = getLoadStorePointerOperand(Instr); 2368 // An alignment of 0 means target abi alignment. We need to use the scalar's 2369 // target abi alignment in such a case. 2370 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2371 const Align Alignment = 2372 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2373 unsigned AddressSpace = getLoadStoreAddressSpace(Instr); 2374 2375 // Determine if the pointer operand of the access is either consecutive or 2376 // reverse consecutive. 2377 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2378 bool ConsecutiveStride = 2379 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2380 bool CreateGatherScatter = 2381 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2382 2383 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2384 // gather/scatter. Otherwise Decision should have been to Scalarize. 2385 assert((ConsecutiveStride || CreateGatherScatter) && 2386 "The instruction should be scalarized"); 2387 2388 // Handle consecutive loads/stores. 2389 if (ConsecutiveStride) 2390 Ptr = getOrCreateScalarValue(Ptr, {0, 0}); 2391 2392 VectorParts Mask; 2393 bool isMaskRequired = BlockInMask; 2394 if (isMaskRequired) 2395 Mask = *BlockInMask; 2396 2397 bool InBounds = false; 2398 if (auto *gep = dyn_cast<GetElementPtrInst>( 2399 getLoadStorePointerOperand(Instr)->stripPointerCasts())) 2400 InBounds = gep->isInBounds(); 2401 2402 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2403 // Calculate the pointer for the specific unroll-part. 
2404 GetElementPtrInst *PartPtr = nullptr; 2405 2406 if (Reverse) { 2407 // If the address is consecutive but reversed, then the 2408 // wide store needs to start at the last vector element. 2409 PartPtr = cast<GetElementPtrInst>( 2410 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2411 PartPtr->setIsInBounds(InBounds); 2412 PartPtr = cast<GetElementPtrInst>( 2413 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2414 PartPtr->setIsInBounds(InBounds); 2415 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2416 Mask[Part] = reverseVector(Mask[Part]); 2417 } else { 2418 PartPtr = cast<GetElementPtrInst>( 2419 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2420 PartPtr->setIsInBounds(InBounds); 2421 } 2422 2423 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2424 }; 2425 2426 // Handle Stores: 2427 if (SI) { 2428 setDebugLocFromInst(Builder, SI); 2429 2430 for (unsigned Part = 0; Part < UF; ++Part) { 2431 Instruction *NewSI = nullptr; 2432 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); 2433 if (CreateGatherScatter) { 2434 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2435 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2436 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, 2437 Alignment.value(), MaskPart); 2438 } else { 2439 if (Reverse) { 2440 // If we store to reverse consecutive memory locations, then we need 2441 // to reverse the order of elements in the stored value. 2442 StoredVal = reverseVector(StoredVal); 2443 // We don't want to update the value in the map as it might be used in 2444 // another expression. So don't call resetVectorValue(StoredVal). 2445 } 2446 auto *VecPtr = CreateVecPtr(Part, Ptr); 2447 if (isMaskRequired) 2448 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, 2449 Alignment.value(), Mask[Part]); 2450 else 2451 NewSI = 2452 Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); 2453 } 2454 addMetadata(NewSI, SI); 2455 } 2456 return; 2457 } 2458 2459 // Handle loads. 2460 assert(LI && "Must have a load instruction"); 2461 setDebugLocFromInst(Builder, LI); 2462 for (unsigned Part = 0; Part < UF; ++Part) { 2463 Value *NewLI; 2464 if (CreateGatherScatter) { 2465 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; 2466 Value *VectorGep = getOrCreateVectorValue(Ptr, Part); 2467 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, 2468 nullptr, "wide.masked.gather"); 2469 addMetadata(NewLI, LI); 2470 } else { 2471 auto *VecPtr = CreateVecPtr(Part, Ptr); 2472 if (isMaskRequired) 2473 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part], 2474 UndefValue::get(DataTy), 2475 "wide.masked.load"); 2476 else 2477 NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), 2478 "wide.load"); 2479 2480 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2481 addMetadata(NewLI, LI); 2482 if (Reverse) 2483 NewLI = reverseVector(NewLI); 2484 } 2485 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2486 } 2487 } 2488 2489 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2490 const VPIteration &Instance, 2491 bool IfPredicateInstr) { 2492 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2493 2494 setDebugLocFromInst(Builder, Instr); 2495 2496 // Does this instruction return a value ? 
2497 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2498 2499 Instruction *Cloned = Instr->clone(); 2500 if (!IsVoidRetTy) 2501 Cloned->setName(Instr->getName() + ".cloned"); 2502 2503 // Replace the operands of the cloned instructions with their scalar 2504 // equivalents in the new loop. 2505 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2506 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2507 Cloned->setOperand(op, NewOp); 2508 } 2509 addNewMetadata(Cloned, Instr); 2510 2511 // Place the cloned scalar in the new loop. 2512 Builder.Insert(Cloned); 2513 2514 // Add the cloned scalar to the scalar map entry. 2515 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2516 2517 // If we just cloned a new assumption, add it the assumption cache. 2518 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2519 if (II->getIntrinsicID() == Intrinsic::assume) 2520 AC->registerAssumption(II); 2521 2522 // End if-block. 2523 if (IfPredicateInstr) 2524 PredicatedInstructions.push_back(Cloned); 2525 } 2526 2527 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2528 Value *End, Value *Step, 2529 Instruction *DL) { 2530 BasicBlock *Header = L->getHeader(); 2531 BasicBlock *Latch = L->getLoopLatch(); 2532 // As we're just creating this loop, it's possible no latch exists 2533 // yet. If so, use the header as this will be a single block loop. 2534 if (!Latch) 2535 Latch = Header; 2536 2537 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2538 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2539 setDebugLocFromInst(Builder, OldInst); 2540 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2541 2542 Builder.SetInsertPoint(Latch->getTerminator()); 2543 setDebugLocFromInst(Builder, OldInst); 2544 2545 // Create i+1 and fill the PHINode. 2546 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2547 Induction->addIncoming(Start, L->getLoopPreheader()); 2548 Induction->addIncoming(Next, Latch); 2549 // Create the compare. 2550 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2551 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2552 2553 // Now we have two terminators. Remove the old one from the block. 2554 Latch->getTerminator()->eraseFromParent(); 2555 2556 return Induction; 2557 } 2558 2559 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2560 if (TripCount) 2561 return TripCount; 2562 2563 assert(L && "Create Trip Count for null loop."); 2564 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2565 // Find the loop boundaries. 2566 ScalarEvolution *SE = PSE.getSE(); 2567 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2568 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2569 "Invalid loop count"); 2570 2571 Type *IdxTy = Legal->getWidestInductionType(); 2572 assert(IdxTy && "No type for induction"); 2573 2574 // The exit count might have the type of i64 while the phi is i32. This can 2575 // happen if we have an induction variable that is sign extended before the 2576 // compare. The only way that we get a backedge taken count is that the 2577 // induction variable was signed and as such will not overflow. In such a case 2578 // truncation is legal. 
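  // Illustrative example (hypothetical source, not from a test): in
  //   for (int32_t i = 0; i < n; ++i) ...
  // where i is sign-extended to compare against an i64 bound n, SCEV may
  // express the backedge-taken count as an i64 while the widest induction
  // type is i32; the count is then truncated to the induction type below.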
2579 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2580 IdxTy->getPrimitiveSizeInBits()) 2581 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2582 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2583 2584 // Get the total trip count from the count by adding 1. 2585 const SCEV *ExitCount = SE->getAddExpr( 2586 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2587 2588 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2589 2590 // Expand the trip count and place the new instructions in the preheader. 2591 // Notice that the pre-header does not change, only the loop body. 2592 SCEVExpander Exp(*SE, DL, "induction"); 2593 2594 // Count holds the overall loop count (N). 2595 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2596 L->getLoopPreheader()->getTerminator()); 2597 2598 if (TripCount->getType()->isPointerTy()) 2599 TripCount = 2600 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2601 L->getLoopPreheader()->getTerminator()); 2602 2603 return TripCount; 2604 } 2605 2606 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2607 if (VectorTripCount) 2608 return VectorTripCount; 2609 2610 Value *TC = getOrCreateTripCount(L); 2611 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2612 2613 Type *Ty = TC->getType(); 2614 Constant *Step = ConstantInt::get(Ty, VF * UF); 2615 2616 // If the tail is to be folded by masking, round the number of iterations N 2617 // up to a multiple of Step instead of rounding down. This is done by first 2618 // adding Step-1 and then rounding down. Note that it's ok if this addition 2619 // overflows: the vector induction variable will eventually wrap to zero given 2620 // that it starts at zero and its Step is a power of two; the loop will then 2621 // exit, with the last early-exit vector comparison also producing all-true. 2622 if (Cost->foldTailByMasking()) { 2623 assert(isPowerOf2_32(VF * UF) && 2624 "VF*UF must be a power of 2 when folding tail by masking"); 2625 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2626 } 2627 2628 // Now we need to generate the expression for the part of the loop that the 2629 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2630 // iterations are not required for correctness, or N - Step, otherwise. Step 2631 // is equal to the vectorization factor (number of SIMD elements) times the 2632 // unroll factor (number of SIMD instructions). 2633 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2634 2635 // If there is a non-reversed interleaved group that may speculatively access 2636 // memory out-of-bounds, we need to ensure that there will be at least one 2637 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2638 // the trip count, we set the remainder to be equal to the step. If the step 2639 // does not evenly divide the trip count, no adjustment is necessary since 2640 // there will already be scalar iterations. Note that the minimum iterations 2641 // check ensures that N >= Step. 
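  // Worked example (hypothetical values): with VF = 4 and UF = 2, Step = 8.
  // For a trip count N = 19 the vector loop covers N - (N % 8) = 16
  // iterations and the scalar remainder covers 3. If N were 16 and a scalar
  // epilogue were required for interleave-group gaps, the select below would
  // set the remainder to a full step (8), so the vector loop would cover 8
  // iterations and the scalar loop the other 8.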
2642 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2643 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2644 R = Builder.CreateSelect(IsZero, Step, R); 2645 } 2646 2647 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2648 2649 return VectorTripCount; 2650 } 2651 2652 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2653 const DataLayout &DL) { 2654 // Verify that V is a vector type with same number of elements as DstVTy. 2655 unsigned VF = DstVTy->getNumElements(); 2656 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2657 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2658 Type *SrcElemTy = SrcVecTy->getElementType(); 2659 Type *DstElemTy = DstVTy->getElementType(); 2660 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2661 "Vector elements must have same size"); 2662 2663 // Do a direct cast if element types are castable. 2664 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2665 return Builder.CreateBitOrPointerCast(V, DstVTy); 2666 } 2667 // V cannot be directly casted to desired vector type. 2668 // May happen when V is a floating point vector but DstVTy is a vector of 2669 // pointers or vice-versa. Handle this using a two-step bitcast using an 2670 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2671 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2672 "Only one type should be a pointer type"); 2673 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2674 "Only one type should be a floating point type"); 2675 Type *IntTy = 2676 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2677 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2678 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2679 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2680 } 2681 2682 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2683 BasicBlock *Bypass) { 2684 Value *Count = getOrCreateTripCount(L); 2685 BasicBlock *BB = L->getLoopPreheader(); 2686 IRBuilder<> Builder(BB->getTerminator()); 2687 2688 // Generate code to check if the loop's trip count is less than VF * UF, or 2689 // equal to it in case a scalar epilogue is required; this implies that the 2690 // vector trip count is zero. This check also covers the case where adding one 2691 // to the backedge-taken count overflowed leading to an incorrect trip count 2692 // of zero. In this case we will also jump to the scalar loop. 2693 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE 2694 : ICmpInst::ICMP_ULT; 2695 2696 // If tail is to be folded, vector loop takes care of all iterations. 2697 Value *CheckMinIters = Builder.getFalse(); 2698 if (!Cost->foldTailByMasking()) 2699 CheckMinIters = Builder.CreateICmp( 2700 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2701 "min.iters.check"); 2702 2703 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2704 // Update dominator tree immediately if the generated block is a 2705 // LoopBypassBlock because SCEV expansions to generate loop bypass 2706 // checks may query it before the current function is finished. 
2707 DT->addNewBlock(NewBB, BB); 2708 if (L->getParentLoop()) 2709 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2710 ReplaceInstWithInst(BB->getTerminator(), 2711 BranchInst::Create(Bypass, NewBB, CheckMinIters)); 2712 LoopBypassBlocks.push_back(BB); 2713 } 2714 2715 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2716 BasicBlock *BB = L->getLoopPreheader(); 2717 2718 // Generate the code to check that the SCEV assumptions that we made. 2719 // We want the new basic block to start at the first instruction in a 2720 // sequence of instructions that form a check. 2721 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2722 "scev.check"); 2723 Value *SCEVCheck = 2724 Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); 2725 2726 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2727 if (C->isZero()) 2728 return; 2729 2730 assert(!BB->getParent()->hasOptSize() && 2731 "Cannot SCEV check stride or overflow when optimizing for size"); 2732 2733 // Create a new block containing the stride check. 2734 BB->setName("vector.scevcheck"); 2735 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2736 // Update dominator tree immediately if the generated block is a 2737 // LoopBypassBlock because SCEV expansions to generate loop bypass 2738 // checks may query it before the current function is finished. 2739 DT->addNewBlock(NewBB, BB); 2740 if (L->getParentLoop()) 2741 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2742 ReplaceInstWithInst(BB->getTerminator(), 2743 BranchInst::Create(Bypass, NewBB, SCEVCheck)); 2744 LoopBypassBlocks.push_back(BB); 2745 AddedSafetyChecks = true; 2746 } 2747 2748 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2749 // VPlan-native path does not do any analysis for runtime checks currently. 2750 if (EnableVPlanNativePath) 2751 return; 2752 2753 BasicBlock *BB = L->getLoopPreheader(); 2754 2755 // Generate the code that checks in runtime if arrays overlap. We put the 2756 // checks into a separate block to make the more common case of few elements 2757 // faster. 2758 Instruction *FirstCheckInst; 2759 Instruction *MemRuntimeCheck; 2760 std::tie(FirstCheckInst, MemRuntimeCheck) = 2761 Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); 2762 if (!MemRuntimeCheck) 2763 return; 2764 2765 if (BB->getParent()->hasOptSize()) { 2766 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2767 "Cannot emit memory checks when optimizing for size, unless forced " 2768 "to vectorize."); 2769 ORE->emit([&]() { 2770 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2771 L->getStartLoc(), L->getHeader()) 2772 << "Code-size may be reduced by not forcing " 2773 "vectorization, or by source-code modifications " 2774 "eliminating the need for runtime checks " 2775 "(e.g., adding 'restrict')."; 2776 }); 2777 } 2778 2779 // Create a new block containing the memory check. 2780 BB->setName("vector.memcheck"); 2781 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); 2782 // Update dominator tree immediately if the generated block is a 2783 // LoopBypassBlock because SCEV expansions to generate loop bypass 2784 // checks may query it before the current function is finished. 
2785 DT->addNewBlock(NewBB, BB); 2786 if (L->getParentLoop()) 2787 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); 2788 ReplaceInstWithInst(BB->getTerminator(), 2789 BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); 2790 LoopBypassBlocks.push_back(BB); 2791 AddedSafetyChecks = true; 2792 2793 // We currently don't use LoopVersioning for the actual loop cloning but we 2794 // still use it to add the noalias metadata. 2795 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2796 PSE.getSE()); 2797 LVer->prepareNoAliasMetadata(); 2798 } 2799 2800 Value *InnerLoopVectorizer::emitTransformedIndex( 2801 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2802 const InductionDescriptor &ID) const { 2803 2804 SCEVExpander Exp(*SE, DL, "induction"); 2805 auto Step = ID.getStep(); 2806 auto StartValue = ID.getStartValue(); 2807 assert(Index->getType() == Step->getType() && 2808 "Index type does not match StepValue type"); 2809 2810 // Note: the IR at this point is broken. We cannot use SE to create any new 2811 // SCEV and then expand it, hoping that SCEV's simplification will give us 2812 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2813 // lead to various SCEV crashes. So all we can do is to use builder and rely 2814 // on InstCombine for future simplifications. Here we handle some trivial 2815 // cases only. 2816 auto CreateAdd = [&B](Value *X, Value *Y) { 2817 assert(X->getType() == Y->getType() && "Types don't match!"); 2818 if (auto *CX = dyn_cast<ConstantInt>(X)) 2819 if (CX->isZero()) 2820 return Y; 2821 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2822 if (CY->isZero()) 2823 return X; 2824 return B.CreateAdd(X, Y); 2825 }; 2826 2827 auto CreateMul = [&B](Value *X, Value *Y) { 2828 assert(X->getType() == Y->getType() && "Types don't match!"); 2829 if (auto *CX = dyn_cast<ConstantInt>(X)) 2830 if (CX->isOne()) 2831 return Y; 2832 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2833 if (CY->isOne()) 2834 return X; 2835 return B.CreateMul(X, Y); 2836 }; 2837 2838 switch (ID.getKind()) { 2839 case InductionDescriptor::IK_IntInduction: { 2840 assert(Index->getType() == StartValue->getType() && 2841 "Index type does not match StartValue type"); 2842 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2843 return B.CreateSub(StartValue, Index); 2844 auto *Offset = CreateMul( 2845 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2846 return CreateAdd(StartValue, Offset); 2847 } 2848 case InductionDescriptor::IK_PtrInduction: { 2849 assert(isa<SCEVConstant>(Step) && 2850 "Expected constant step for pointer induction"); 2851 return B.CreateGEP( 2852 StartValue->getType()->getPointerElementType(), StartValue, 2853 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2854 &*B.GetInsertPoint()))); 2855 } 2856 case InductionDescriptor::IK_FpInduction: { 2857 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2858 auto InductionBinOp = ID.getInductionBinOp(); 2859 assert(InductionBinOp && 2860 (InductionBinOp->getOpcode() == Instruction::FAdd || 2861 InductionBinOp->getOpcode() == Instruction::FSub) && 2862 "Original bin op should be defined for FP induction"); 2863 2864 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2865 2866 // Floating point operations had to be 'fast' to enable the induction. 
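// As a sketch of the transform below: for a float IV with StartValue 1.0,
// Step 0.5 and an FAdd update, the value at Index i is computed as
// 1.0 + (0.5 * i); for an FSub update it is 1.0 - (0.5 * i).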
2867 FastMathFlags Flags; 2868 Flags.setFast(); 2869 2870 Value *MulExp = B.CreateFMul(StepValue, Index); 2871 if (isa<Instruction>(MulExp)) 2872 // We have to check, the MulExp may be a constant. 2873 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2874 2875 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2876 "induction"); 2877 if (isa<Instruction>(BOp)) 2878 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2879 2880 return BOp; 2881 } 2882 case InductionDescriptor::IK_NoInduction: 2883 return nullptr; 2884 } 2885 llvm_unreachable("invalid enum"); 2886 } 2887 2888 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2889 /* 2890 In this function we generate a new loop. The new loop will contain 2891 the vectorized instructions while the old loop will continue to run the 2892 scalar remainder. 2893 2894 [ ] <-- loop iteration number check. 2895 / | 2896 / v 2897 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2898 | / | 2899 | / v 2900 || [ ] <-- vector pre header. 2901 |/ | 2902 | v 2903 | [ ] \ 2904 | [ ]_| <-- vector loop. 2905 | | 2906 | v 2907 | -[ ] <--- middle-block. 2908 | / | 2909 | / v 2910 -|- >[ ] <--- new preheader. 2911 | | 2912 | v 2913 | [ ] \ 2914 | [ ]_| <-- old scalar loop to handle remainder. 2915 \ | 2916 \ v 2917 >[ ] <-- exit block. 2918 ... 2919 */ 2920 2921 BasicBlock *OldBasicBlock = OrigLoop->getHeader(); 2922 BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); 2923 BasicBlock *ExitBlock = OrigLoop->getExitBlock(); 2924 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2925 assert(VectorPH && "Invalid loop structure"); 2926 assert(ExitBlock && "Must have an exit block"); 2927 2928 // Some loops have a single integer induction variable, while other loops 2929 // don't. One example is c++ iterators that often have multiple pointer 2930 // induction variables. In the code below we also support a case where we 2931 // don't have a single induction variable. 2932 // 2933 // We try to obtain an induction variable from the original loop as hard 2934 // as possible. However if we don't find one that: 2935 // - is an integer 2936 // - counts from zero, stepping by one 2937 // - is the size of the widest induction variable type 2938 // then we create a new one. 2939 OldInduction = Legal->getPrimaryInduction(); 2940 Type *IdxTy = Legal->getWidestInductionType(); 2941 2942 // Split the single block loop into the two loop structure described above. 2943 BasicBlock *VecBody = 2944 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); 2945 BasicBlock *MiddleBlock = 2946 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); 2947 BasicBlock *ScalarPH = 2948 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); 2949 2950 // Create and register the new vector loop. 2951 Loop *Lp = LI->AllocateLoop(); 2952 Loop *ParentLoop = OrigLoop->getParentLoop(); 2953 2954 // Insert the new loop into the loop nest and register the new basic blocks 2955 // before calling any utilities such as SCEV that require valid LoopInfo. 2956 if (ParentLoop) { 2957 ParentLoop->addChildLoop(Lp); 2958 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); 2959 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); 2960 } else { 2961 LI->addTopLevelLoop(Lp); 2962 } 2963 Lp->addBasicBlockToLoop(VecBody, *LI); 2964 2965 // Find the loop boundaries. 2966 Value *Count = getOrCreateTripCount(Lp); 2967 2968 Value *StartIdx = ConstantInt::get(IdxTy, 0); 2969 2970 // Now, compare the new count to zero. 
If it is zero skip the vector loop and 2971 // jump to the scalar loop. This check also covers the case where the 2972 // backedge-taken count is uint##_max: adding one to it will overflow leading 2973 // to an incorrect trip count of zero. In this (rare) case we will also jump 2974 // to the scalar loop. 2975 emitMinimumIterationCountCheck(Lp, ScalarPH); 2976 2977 // Generate the code to check any assumptions that we've made for SCEV 2978 // expressions. 2979 emitSCEVChecks(Lp, ScalarPH); 2980 2981 // Generate the code that checks in runtime if arrays overlap. We put the 2982 // checks into a separate block to make the more common case of few elements 2983 // faster. 2984 emitMemRuntimeChecks(Lp, ScalarPH); 2985 2986 // Generate the induction variable. 2987 // The loop step is equal to the vectorization factor (num of SIMD elements) 2988 // times the unroll factor (num of SIMD instructions). 2989 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 2990 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 2991 Induction = 2992 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 2993 getDebugLocFromInstOrOperands(OldInduction)); 2994 2995 // We are going to resume the execution of the scalar loop. 2996 // Go over all of the induction variables that we found and fix the 2997 // PHIs that are left in the scalar version of the loop. 2998 // The starting values of PHI nodes depend on the counter of the last 2999 // iteration in the vectorized loop. 3000 // If we come from a bypass edge then we need to start from the original 3001 // start value. 3002 3003 // This variable saves the new starting index for the scalar loop. It is used 3004 // to test if there are any tail iterations left once the vector loop has 3005 // completed. 3006 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); 3007 for (auto &InductionEntry : *List) { 3008 PHINode *OrigPhi = InductionEntry.first; 3009 InductionDescriptor II = InductionEntry.second; 3010 3011 // Create phi nodes to merge from the backedge-taken check block. 3012 PHINode *BCResumeVal = PHINode::Create( 3013 OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); 3014 // Copy original phi DL over to the new one. 3015 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3016 Value *&EndValue = IVEndValues[OrigPhi]; 3017 if (OrigPhi == OldInduction) { 3018 // We know what the end value is. 3019 EndValue = CountRoundDown; 3020 } else { 3021 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3022 Type *StepType = II.getStep()->getType(); 3023 Instruction::CastOps CastOp = 3024 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3025 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3026 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 3027 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3028 EndValue->setName("ind.end"); 3029 } 3030 3031 // The new PHI merges the original incoming value, in case of a bypass, 3032 // or the value at the end of the vectorized loop. 3033 BCResumeVal->addIncoming(EndValue, MiddleBlock); 3034 3035 // Fix the scalar body counter (PHI node). 3036 // The old induction's phi node in the scalar body needs the truncated 3037 // value. 
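// Roughly, the resume phi built above ends up as
//   %bc.resume.val = phi [ <IV end value>, %middle.block ],
//                        [ <original start value>, <each bypass block> ]
// where the bypass incomings are added in the loop below.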
3038 for (BasicBlock *BB : LoopBypassBlocks) 3039 BCResumeVal->addIncoming(II.getStartValue(), BB); 3040 OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); 3041 } 3042 3043 // We need the OrigLoop (scalar loop part) latch terminator to help 3044 // produce correct debug info for the middle block BB instructions. 3045 // The legality check stage guarantees that the loop will have a single 3046 // latch. 3047 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3048 "Scalar loop latch terminator isn't a branch"); 3049 BranchInst *ScalarLatchBr = 3050 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3051 3052 // Add a check in the middle block to see if we have completed 3053 // all of the iterations in the first vector loop. 3054 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3055 // If tail is to be folded, we know we don't need to run the remainder. 3056 Value *CmpN = Builder.getTrue(); 3057 if (!Cost->foldTailByMasking()) { 3058 CmpN = 3059 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3060 CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); 3061 3062 // Here we use the same DebugLoc as the scalar loop latch branch instead 3063 // of the corresponding compare because they may have ended up with 3064 // different line numbers and we want to avoid awkward line stepping while 3065 // debugging. Eg. if the compare has got a line number inside the loop. 3066 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3067 } 3068 3069 BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); 3070 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3071 ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); 3072 3073 // Get ready to start creating new instructions into the vectorized body. 3074 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); 3075 3076 // Save the state. 3077 LoopVectorPreHeader = Lp->getLoopPreheader(); 3078 LoopScalarPreHeader = ScalarPH; 3079 LoopMiddleBlock = MiddleBlock; 3080 LoopExitBlock = ExitBlock; 3081 LoopVectorBody = VecBody; 3082 LoopScalarBody = OldBasicBlock; 3083 3084 Optional<MDNode *> VectorizedLoopID = 3085 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3086 LLVMLoopVectorizeFollowupVectorized}); 3087 if (VectorizedLoopID.hasValue()) { 3088 Lp->setLoopID(VectorizedLoopID.getValue()); 3089 3090 // Do not setAlreadyVectorized if loop attributes have been defined 3091 // explicitly. 3092 return LoopVectorPreHeader; 3093 } 3094 3095 // Keep all loop hints from the original loop on the vector loop (we'll 3096 // replace the vectorizer-specific hints below). 3097 if (MDNode *LID = OrigLoop->getLoopID()) 3098 Lp->setLoopID(LID); 3099 3100 LoopVectorizeHints Hints(Lp, true, *ORE); 3101 Hints.setAlreadyVectorized(); 3102 3103 return LoopVectorPreHeader; 3104 } 3105 3106 // Fix up external users of the induction variable. At this point, we are 3107 // in LCSSA form, with all external PHIs that use the IV having one input value, 3108 // coming from the remainder loop. We need those PHIs to also have a correct 3109 // value for the IV when arriving directly from the middle block. 
3110 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3111 const InductionDescriptor &II,
3112 Value *CountRoundDown, Value *EndValue,
3113 BasicBlock *MiddleBlock) {
3114 // There are two kinds of external IV usages - those that use the value
3115 // computed in the last iteration (the PHI) and those that use the penultimate
3116 // value (the value that feeds into the phi from the loop latch).
3117 // We allow both, but they, obviously, have different values.
3118
3119 assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3120
3121 DenseMap<Value *, Value *> MissingVals;
3122
3123 // An external user of the last iteration's value should see the value that
3124 // the remainder loop uses to initialize its own IV.
3125 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3126 for (User *U : PostInc->users()) {
3127 Instruction *UI = cast<Instruction>(U);
3128 if (!OrigLoop->contains(UI)) {
3129 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3130 MissingVals[UI] = EndValue;
3131 }
3132 }
3133
3134 // An external user of the penultimate value needs to see EndValue - Step.
3135 // The simplest way to get this is to recompute it from the constituent SCEVs,
3136 // that is Start + (Step * (CRD - 1)).
3137 for (User *U : OrigPhi->users()) {
3138 auto *UI = cast<Instruction>(U);
3139 if (!OrigLoop->contains(UI)) {
3140 const DataLayout &DL =
3141 OrigLoop->getHeader()->getModule()->getDataLayout();
3142 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3143
3144 IRBuilder<> B(MiddleBlock->getTerminator());
3145 Value *CountMinusOne = B.CreateSub(
3146 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3147 Value *CMO =
3148 !II.getStep()->getType()->isIntegerTy()
3149 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3150 II.getStep()->getType())
3151 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3152 CMO->setName("cast.cmo");
3153 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3154 Escape->setName("ind.escape");
3155 MissingVals[UI] = Escape;
3156 }
3157 }
3158
3159 for (auto &I : MissingVals) {
3160 PHINode *PHI = cast<PHINode>(I.first);
3161 // One corner case we have to handle is two IVs "chasing" each other,
3162 // that is %IV2 = phi [...], [ %IV1, %latch ]
3163 // In this case, if IV1 has an external use, we need to avoid adding both
3164 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3165 // don't already have an incoming value for the middle block.
3166 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3167 PHI->addIncoming(I.second, MiddleBlock); 3168 } 3169 } 3170 3171 namespace { 3172 3173 struct CSEDenseMapInfo { 3174 static bool canHandle(const Instruction *I) { 3175 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3176 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3177 } 3178 3179 static inline Instruction *getEmptyKey() { 3180 return DenseMapInfo<Instruction *>::getEmptyKey(); 3181 } 3182 3183 static inline Instruction *getTombstoneKey() { 3184 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3185 } 3186 3187 static unsigned getHashValue(const Instruction *I) { 3188 assert(canHandle(I) && "Unknown instruction!"); 3189 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3190 I->value_op_end())); 3191 } 3192 3193 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3194 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3195 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3196 return LHS == RHS; 3197 return LHS->isIdenticalTo(RHS); 3198 } 3199 }; 3200 3201 } // end anonymous namespace 3202 3203 ///Perform cse of induction variable instructions. 3204 static void cse(BasicBlock *BB) { 3205 // Perform simple cse. 3206 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3207 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3208 Instruction *In = &*I++; 3209 3210 if (!CSEDenseMapInfo::canHandle(In)) 3211 continue; 3212 3213 // Check if we can replace this instruction with any of the 3214 // visited instructions. 3215 if (Instruction *V = CSEMap.lookup(In)) { 3216 In->replaceAllUsesWith(V); 3217 In->eraseFromParent(); 3218 continue; 3219 } 3220 3221 CSEMap[In] = In; 3222 } 3223 } 3224 3225 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3226 unsigned VF, 3227 bool &NeedToScalarize) { 3228 Function *F = CI->getCalledFunction(); 3229 StringRef FnName = CI->getCalledFunction()->getName(); 3230 Type *ScalarRetTy = CI->getType(); 3231 SmallVector<Type *, 4> Tys, ScalarTys; 3232 for (auto &ArgOp : CI->arg_operands()) 3233 ScalarTys.push_back(ArgOp->getType()); 3234 3235 // Estimate cost of scalarized vector call. The source operands are assumed 3236 // to be vectors, so we need to extract individual elements from there, 3237 // execute VF scalar calls, and then gather the result into the vector return 3238 // value. 3239 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3240 if (VF == 1) 3241 return ScalarCallCost; 3242 3243 // Compute corresponding vector type for return value and arguments. 3244 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3245 for (Type *ScalarTy : ScalarTys) 3246 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3247 3248 // Compute costs of unpacking argument values for the scalar calls and 3249 // packing the return values to a vector. 3250 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3251 3252 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3253 3254 // If we can't emit a vector call for this function, then the currently found 3255 // cost is the cost we need to return. 3256 NeedToScalarize = true; 3257 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin()) 3258 return Cost; 3259 3260 // If the corresponding vector cost is cheaper, return its cost. 
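// As a made-up example: with VF = 4, a scalar call cost of 10 and a
// scalarization overhead of 12, the scalarized estimate is 10 * 4 + 12 = 52,
// so a vector library call costed below at, say, 20 would win.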
3261 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3262 if (VectorCallCost < Cost) { 3263 NeedToScalarize = false; 3264 return VectorCallCost; 3265 } 3266 return Cost; 3267 } 3268 3269 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3270 unsigned VF) { 3271 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3272 assert(ID && "Expected intrinsic call!"); 3273 3274 FastMathFlags FMF; 3275 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3276 FMF = FPMO->getFastMathFlags(); 3277 3278 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3279 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF); 3280 } 3281 3282 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3283 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3284 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3285 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3286 } 3287 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3288 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3289 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3290 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3291 } 3292 3293 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3294 // For every instruction `I` in MinBWs, truncate the operands, create a 3295 // truncated version of `I` and reextend its result. InstCombine runs 3296 // later and will remove any ext/trunc pairs. 3297 SmallPtrSet<Value *, 4> Erased; 3298 for (const auto &KV : Cost->getMinimalBitwidths()) { 3299 // If the value wasn't vectorized, we must maintain the original scalar 3300 // type. The absence of the value from VectorLoopValueMap indicates that it 3301 // wasn't vectorized. 3302 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3303 continue; 3304 for (unsigned Part = 0; Part < UF; ++Part) { 3305 Value *I = getOrCreateVectorValue(KV.first, Part); 3306 if (Erased.find(I) != Erased.end() || I->use_empty() || 3307 !isa<Instruction>(I)) 3308 continue; 3309 Type *OriginalTy = I->getType(); 3310 Type *ScalarTruncatedTy = 3311 IntegerType::get(OriginalTy->getContext(), KV.second); 3312 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3313 OriginalTy->getVectorNumElements()); 3314 if (TruncatedTy == OriginalTy) 3315 continue; 3316 3317 IRBuilder<> B(cast<Instruction>(I)); 3318 auto ShrinkOperand = [&](Value *V) -> Value * { 3319 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3320 if (ZI->getSrcTy() == TruncatedTy) 3321 return ZI->getOperand(0); 3322 return B.CreateZExtOrTrunc(V, TruncatedTy); 3323 }; 3324 3325 // The actual instruction modification depends on the instruction type, 3326 // unfortunately. 3327 Value *NewI = nullptr; 3328 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3329 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3330 ShrinkOperand(BO->getOperand(1))); 3331 3332 // Any wrapping introduced by shrinking this operation shouldn't be 3333 // considered undefined behavior. So, we can't unconditionally copy 3334 // arithmetic wrapping flags to NewI. 
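// For example, an 'add nsw i32' whose result only needs 8 bits may well wrap
// once it is narrowed to i8, so nsw/nuw must be dropped here.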
3335 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3336 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3337 NewI = 3338 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3339 ShrinkOperand(CI->getOperand(1))); 3340 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3341 NewI = B.CreateSelect(SI->getCondition(), 3342 ShrinkOperand(SI->getTrueValue()), 3343 ShrinkOperand(SI->getFalseValue())); 3344 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3345 switch (CI->getOpcode()) { 3346 default: 3347 llvm_unreachable("Unhandled cast!"); 3348 case Instruction::Trunc: 3349 NewI = ShrinkOperand(CI->getOperand(0)); 3350 break; 3351 case Instruction::SExt: 3352 NewI = B.CreateSExtOrTrunc( 3353 CI->getOperand(0), 3354 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3355 break; 3356 case Instruction::ZExt: 3357 NewI = B.CreateZExtOrTrunc( 3358 CI->getOperand(0), 3359 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3360 break; 3361 } 3362 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3363 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3364 auto *O0 = B.CreateZExtOrTrunc( 3365 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3366 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3367 auto *O1 = B.CreateZExtOrTrunc( 3368 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3369 3370 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3371 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3372 // Don't do anything with the operands, just extend the result. 3373 continue; 3374 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3375 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3376 auto *O0 = B.CreateZExtOrTrunc( 3377 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3378 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3379 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3380 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3381 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3382 auto *O0 = B.CreateZExtOrTrunc( 3383 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3384 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3385 } else { 3386 // If we don't know what to do, be conservative and don't do anything. 3387 continue; 3388 } 3389 3390 // Lastly, extend the result. 3391 NewI->takeName(cast<Instruction>(I)); 3392 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3393 I->replaceAllUsesWith(Res); 3394 cast<Instruction>(I)->eraseFromParent(); 3395 Erased.insert(I); 3396 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3397 } 3398 } 3399 3400 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3401 for (const auto &KV : Cost->getMinimalBitwidths()) { 3402 // If the value wasn't vectorized, we must maintain the original scalar 3403 // type. The absence of the value from VectorLoopValueMap indicates that it 3404 // wasn't vectorized. 
3405 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3406 continue; 3407 for (unsigned Part = 0; Part < UF; ++Part) { 3408 Value *I = getOrCreateVectorValue(KV.first, Part); 3409 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3410 if (Inst && Inst->use_empty()) { 3411 Value *NewI = Inst->getOperand(0); 3412 Inst->eraseFromParent(); 3413 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3414 } 3415 } 3416 } 3417 } 3418 3419 void InnerLoopVectorizer::fixVectorizedLoop() { 3420 // Insert truncates and extends for any truncated instructions as hints to 3421 // InstCombine. 3422 if (VF > 1) 3423 truncateToMinimalBitwidths(); 3424 3425 // Fix widened non-induction PHIs by setting up the PHI operands. 3426 if (OrigPHIsToFix.size()) { 3427 assert(EnableVPlanNativePath && 3428 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3429 fixNonInductionPHIs(); 3430 } 3431 3432 // At this point every instruction in the original loop is widened to a 3433 // vector form. Now we need to fix the recurrences in the loop. These PHI 3434 // nodes are currently empty because we did not want to introduce cycles. 3435 // This is the second stage of vectorizing recurrences. 3436 fixCrossIterationPHIs(); 3437 3438 // Update the dominator tree. 3439 // 3440 // FIXME: After creating the structure of the new loop, the dominator tree is 3441 // no longer up-to-date, and it remains that way until we update it 3442 // here. An out-of-date dominator tree is problematic for SCEV, 3443 // because SCEVExpander uses it to guide code generation. The 3444 // vectorizer use SCEVExpanders in several places. Instead, we should 3445 // keep the dominator tree up-to-date as we go. 3446 updateAnalysis(); 3447 3448 // Fix-up external users of the induction variables. 3449 for (auto &Entry : *Legal->getInductionVars()) 3450 fixupIVUsers(Entry.first, Entry.second, 3451 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3452 IVEndValues[Entry.first], LoopMiddleBlock); 3453 3454 fixLCSSAPHIs(); 3455 for (Instruction *PI : PredicatedInstructions) 3456 sinkScalarOperands(&*PI); 3457 3458 // Remove redundant induction instructions. 3459 cse(LoopVectorBody); 3460 } 3461 3462 void InnerLoopVectorizer::fixCrossIterationPHIs() { 3463 // In order to support recurrences we need to be able to vectorize Phi nodes. 3464 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 3465 // stage #2: We now need to fix the recurrences by adding incoming edges to 3466 // the currently empty PHI nodes. At this point every instruction in the 3467 // original loop is widened to a vector form so we can use them to construct 3468 // the incoming edges. 3469 for (PHINode &Phi : OrigLoop->getHeader()->phis()) { 3470 // Handle first-order recurrences and reductions that need to be fixed. 3471 if (Legal->isFirstOrderRecurrence(&Phi)) 3472 fixFirstOrderRecurrence(&Phi); 3473 else if (Legal->isReductionVariable(&Phi)) 3474 fixReduction(&Phi); 3475 } 3476 } 3477 3478 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { 3479 // This is the second phase of vectorizing first-order recurrences. An 3480 // overview of the transformation is described below. Suppose we have the 3481 // following loop. 3482 // 3483 // for (int i = 0; i < n; ++i) 3484 // b[i] = a[i] - a[i - 1]; 3485 // 3486 // There is a first-order recurrence on "a". 
For this loop, the shorthand 3487 // scalar IR looks like: 3488 // 3489 // scalar.ph: 3490 // s_init = a[-1] 3491 // br scalar.body 3492 // 3493 // scalar.body: 3494 // i = phi [0, scalar.ph], [i+1, scalar.body] 3495 // s1 = phi [s_init, scalar.ph], [s2, scalar.body] 3496 // s2 = a[i] 3497 // b[i] = s2 - s1 3498 // br cond, scalar.body, ... 3499 // 3500 // In this example, s1 is a recurrence because it's value depends on the 3501 // previous iteration. In the first phase of vectorization, we created a 3502 // temporary value for s1. We now complete the vectorization and produce the 3503 // shorthand vector IR shown below (for VF = 4, UF = 1). 3504 // 3505 // vector.ph: 3506 // v_init = vector(..., ..., ..., a[-1]) 3507 // br vector.body 3508 // 3509 // vector.body 3510 // i = phi [0, vector.ph], [i+4, vector.body] 3511 // v1 = phi [v_init, vector.ph], [v2, vector.body] 3512 // v2 = a[i, i+1, i+2, i+3]; 3513 // v3 = vector(v1(3), v2(0, 1, 2)) 3514 // b[i, i+1, i+2, i+3] = v2 - v3 3515 // br cond, vector.body, middle.block 3516 // 3517 // middle.block: 3518 // x = v2(3) 3519 // br scalar.ph 3520 // 3521 // scalar.ph: 3522 // s_init = phi [x, middle.block], [a[-1], otherwise] 3523 // br scalar.body 3524 // 3525 // After execution completes the vector loop, we extract the next value of 3526 // the recurrence (x) to use as the initial value in the scalar loop. 3527 3528 // Get the original loop preheader and single loop latch. 3529 auto *Preheader = OrigLoop->getLoopPreheader(); 3530 auto *Latch = OrigLoop->getLoopLatch(); 3531 3532 // Get the initial and previous values of the scalar recurrence. 3533 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader); 3534 auto *Previous = Phi->getIncomingValueForBlock(Latch); 3535 3536 // Create a vector from the initial value. 3537 auto *VectorInit = ScalarInit; 3538 if (VF > 1) { 3539 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3540 VectorInit = Builder.CreateInsertElement( 3541 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3542 Builder.getInt32(VF - 1), "vector.recur.init"); 3543 } 3544 3545 // We constructed a temporary phi node in the first phase of vectorization. 3546 // This phi node will eventually be deleted. 3547 Builder.SetInsertPoint( 3548 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3549 3550 // Create a phi node for the new recurrence. The current value will either be 3551 // the initial value inserted into a vector or loop-varying vector value. 3552 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3553 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3554 3555 // Get the vectorized previous value of the last part UF - 1. It appears last 3556 // among all unrolled iterations, due to the order of their construction. 3557 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3558 3559 // Find and set the insertion point after the previous value if it is an 3560 // instruction. 3561 BasicBlock::iterator InsertPt; 3562 // Note that the previous value may have been constant-folded so it is not 3563 // guaranteed to be an instruction in the vector loop. 3564 // FIXME: Loop invariant values do not form recurrences. We should deal with 3565 // them earlier. 
3566 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3567 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3568 else { 3569 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3570 if (isa<PHINode>(PreviousLastPart)) 3571 // If the previous value is a phi node, we should insert after all the phi 3572 // nodes in the block containing the PHI to avoid breaking basic block 3573 // verification. Note that the basic block may be different to 3574 // LoopVectorBody, in case we predicate the loop. 3575 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3576 else 3577 InsertPt = ++PreviousInst->getIterator(); 3578 } 3579 Builder.SetInsertPoint(&*InsertPt); 3580 3581 // We will construct a vector for the recurrence by combining the values for 3582 // the current and previous iterations. This is the required shuffle mask. 3583 SmallVector<Constant *, 8> ShuffleMask(VF); 3584 ShuffleMask[0] = Builder.getInt32(VF - 1); 3585 for (unsigned I = 1; I < VF; ++I) 3586 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3587 3588 // The vector from which to take the initial value for the current iteration 3589 // (actual or unrolled). Initially, this is the vector phi node. 3590 Value *Incoming = VecPhi; 3591 3592 // Shuffle the current and previous vector and update the vector parts. 3593 for (unsigned Part = 0; Part < UF; ++Part) { 3594 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3595 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3596 auto *Shuffle = 3597 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3598 ConstantVector::get(ShuffleMask)) 3599 : Incoming; 3600 PhiPart->replaceAllUsesWith(Shuffle); 3601 cast<Instruction>(PhiPart)->eraseFromParent(); 3602 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3603 Incoming = PreviousPart; 3604 } 3605 3606 // Fix the latch value of the new recurrence in the vector loop. 3607 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3608 3609 // Extract the last vector element in the middle block. This will be the 3610 // initial value for the recurrence when jumping to the scalar loop. 3611 auto *ExtractForScalar = Incoming; 3612 if (VF > 1) { 3613 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3614 ExtractForScalar = Builder.CreateExtractElement( 3615 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3616 } 3617 // Extract the second last element in the middle block if the 3618 // Phi is used outside the loop. We need to extract the phi itself 3619 // and not the last element (the phi update in the current iteration). This 3620 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3621 // when the scalar loop is not run at all. 3622 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3623 if (VF > 1) 3624 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3625 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3626 // When loop is unrolled without vectorizing, initialize 3627 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3628 // `Incoming`. This is analogous to the vectorized case above: extracting the 3629 // second last element when VF > 1. 3630 else if (UF > 1) 3631 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3632 3633 // Fix the initial value of the original recurrence in the scalar loop. 
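// In the shorthand IR used above, this builds (roughly):
//   scalar.ph:
//     %scalar.recur.init = phi [ %vector.recur.extract, %middle.block ],
//                              [ %s_init, <other predecessors> ]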
3634 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3635 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3636 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3637 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3638 Start->addIncoming(Incoming, BB);
3639 }
3640
3641 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3642 Phi->setName("scalar.recur");
3643
3644 // Finally, fix users of the recurrence outside the loop. The users will need
3645 // either the last value of the scalar recurrence or the last value of the
3646 // vector recurrence we extracted in the middle block. Since the loop is in
3647 // LCSSA form, we just need to find all the phi nodes for the original scalar
3648 // recurrence in the exit block, and then add an edge for the middle block.
3649 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3650 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3651 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3652 }
3653 }
3654 }
3655
3656 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3657 Constant *Zero = Builder.getInt32(0);
3658
3659 // Get its reduction variable descriptor.
3660 assert(Legal->isReductionVariable(Phi) &&
3661 "Unable to find the reduction variable");
3662 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3663
3664 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3665 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3666 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3667 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3668 RdxDesc.getMinMaxRecurrenceKind();
3669 setDebugLocFromInst(Builder, ReductionStartValue);
3670
3671 // We need to generate a reduction vector from the incoming scalar.
3672 // To do so, we need to generate the 'identity' vector and override
3673 // one of the elements with the incoming scalar reduction. We need
3674 // to do it in the vector-loop preheader.
3675 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3676
3677 // This is the vector-clone of the value that leaves the loop.
3678 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3679
3680 // Find the reduction identity value: zero for addition, or, and xor;
3681 // one for multiplication; -1 for and.
3682 Value *Identity;
3683 Value *VectorStart;
3684 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3685 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3686 // MinMax reductions have the start value as their identity.
3687 if (VF == 1) {
3688 VectorStart = Identity = ReductionStartValue;
3689 } else {
3690 VectorStart = Identity =
3691 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3692 }
3693 } else {
3694 // Handle other reduction kinds:
3695 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3696 RK, VecTy->getScalarType());
3697 if (VF == 1) {
3698 Identity = Iden;
3699 // This vector is the Identity vector where the first element is the
3700 // incoming scalar reduction.
3701 VectorStart = ReductionStartValue;
3702 } else {
3703 Identity = ConstantVector::getSplat(VF, Iden);
3704
3705 // This vector is the Identity vector where the first element is the
3706 // incoming scalar reduction.
3707 VectorStart =
3708 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3709 }
3710 }
3711
3712 // Fix the vector-loop phi.
3713
3714 // Reductions do not have to start at zero. They can start with
3715 // any loop invariant values.
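// For example, for an integer add reduction with VF = 4 and start value %s,
// Identity is <0, 0, 0, 0> and VectorStart is <%s, 0, 0, 0>; only part 0 of
// the phi receives VectorStart, the other unroll parts receive Identity.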
3716 BasicBlock *Latch = OrigLoop->getLoopLatch();
3717 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3718 for (unsigned Part = 0; Part < UF; ++Part) {
3719 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3720 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3721 // Make sure to add the reduction start value only to the
3722 // first unroll part.
3723 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3724 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3725 cast<PHINode>(VecRdxPhi)
3726 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3727 }
3728
3729 // Before each round, move the insertion point right between
3730 // the PHIs and the values we are going to write.
3731 // This allows us to write both PHINodes and the extractelement
3732 // instructions.
3733 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3734
3735 setDebugLocFromInst(Builder, LoopExitInst);
3736
3737 // If tail is folded by masking, the vector value to leave the loop should be
3738 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3739 // instead of the former.
3740 if (Cost->foldTailByMasking()) {
3741 for (unsigned Part = 0; Part < UF; ++Part) {
3742 Value *VecLoopExitInst =
3743 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3744 Value *Sel = nullptr;
3745 for (User *U : VecLoopExitInst->users()) {
3746 if (isa<SelectInst>(U)) {
3747 assert(!Sel && "Reduction exit feeding two selects");
3748 Sel = U;
3749 } else
3750 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3751 }
3752 assert(Sel && "Reduction exit feeds no select");
3753 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3754 }
3755 }
3756
3757 // If the vector reduction can be performed in a smaller type, we truncate
3758 // then extend the loop exit value to enable InstCombine to evaluate the
3759 // entire expression in the smaller type.
3760 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3761 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3762 Builder.SetInsertPoint(
3763 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3764 VectorParts RdxParts(UF);
3765 for (unsigned Part = 0; Part < UF; ++Part) {
3766 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3767 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3768 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3769 : Builder.CreateZExt(Trunc, VecTy);
3770 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3771 UI != RdxParts[Part]->user_end();)
3772 if (*UI != Trunc) {
3773 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3774 RdxParts[Part] = Extnd;
3775 } else {
3776 ++UI;
3777 }
3778 }
3779 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3780 for (unsigned Part = 0; Part < UF; ++Part) {
3781 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3782 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3783 }
3784 }
3785
3786 // Reduce all of the unrolled parts into a single vector.
3787 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3788 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3789
3790 // The middle block terminator has already been assigned a DebugLoc here (the
3791 // OrigLoop's single latch terminator).
We want the whole middle block to 3792 // appear to execute on this line because: (a) it is all compiler generated, 3793 // (b) these instructions are always executed after evaluating the latch 3794 // conditional branch, and (c) other passes may add new predecessors which 3795 // terminate on this line. This is the easiest way to ensure we don't 3796 // accidentally cause an extra step back into the loop while debugging. 3797 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3798 for (unsigned Part = 1; Part < UF; ++Part) { 3799 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3800 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3801 // Floating point operations had to be 'fast' to enable the reduction. 3802 ReducedPartRdx = addFastMathFlag( 3803 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3804 ReducedPartRdx, "bin.rdx"), 3805 RdxDesc.getFastMathFlags()); 3806 else 3807 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3808 RdxPart); 3809 } 3810 3811 if (VF > 1) { 3812 bool NoNaN = Legal->hasFunNoNaNAttr(); 3813 ReducedPartRdx = 3814 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3815 // If the reduction can be performed in a smaller type, we need to extend 3816 // the reduction to the wider type before we branch to the original loop. 3817 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3818 ReducedPartRdx = 3819 RdxDesc.isSigned() 3820 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3821 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3822 } 3823 3824 // Create a phi node that merges control-flow from the backedge-taken check 3825 // block and the middle block. 3826 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3827 LoopScalarPreHeader->getTerminator()); 3828 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3829 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3830 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3831 3832 // Now, we need to fix the users of the reduction variable 3833 // inside and outside of the scalar remainder loop. 3834 // We know that the loop is in LCSSA form. We need to update the 3835 // PHI nodes in the exit blocks. 3836 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3837 // All PHINodes need to have a single entry edge, or two if 3838 // we already fixed them. 3839 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3840 3841 // We found a reduction value exit-PHI. Update it with the 3842 // incoming bypass edge. 3843 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3844 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3845 } // end of the LCSSA phi scan. 3846 3847 // Fix the scalar loop reduction variable with the incoming reduction sum 3848 // from the vector body and from the backedge value. 3849 int IncomingEdgeBlockIdx = 3850 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3851 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3852 // Pick the other block. 3853 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3854 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3855 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3856 } 3857 3858 void InnerLoopVectorizer::fixLCSSAPHIs() { 3859 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3860 if (LCSSAPhi.getNumIncomingValues() == 1) { 3861 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3862 // Non-instruction incoming values will have only one value. 
3863 unsigned LastLane = 0;
3864 if (isa<Instruction>(IncomingValue))
3865 LastLane = Cost->isUniformAfterVectorization(
3866 cast<Instruction>(IncomingValue), VF)
3867 ? 0
3868 : VF - 1;
3869 // Can be a loop invariant incoming value or the last scalar value to be
3870 // extracted from the vectorized loop.
3871 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3872 Value *lastIncomingValue =
3873 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3874 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3875 }
3876 }
3877 }
3878
3879 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3880 // The basic block and loop containing the predicated instruction.
3881 auto *PredBB = PredInst->getParent();
3882 auto *VectorLoop = LI->getLoopFor(PredBB);
3883
3884 // Initialize a worklist with the operands of the predicated instruction.
3885 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3886
3887 // Holds instructions that we need to analyze again. An instruction may be
3888 // reanalyzed if we don't yet know if we can sink it or not.
3889 SmallVector<Instruction *, 8> InstsToReanalyze;
3890
3891 // Returns true if a given use occurs in the predicated block. Phi nodes use
3892 // their operands in their corresponding predecessor blocks.
3893 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3894 auto *I = cast<Instruction>(U.getUser());
3895 BasicBlock *BB = I->getParent();
3896 if (auto *Phi = dyn_cast<PHINode>(I))
3897 BB = Phi->getIncomingBlock(
3898 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3899 return BB == PredBB;
3900 };
3901
3902 // Iteratively sink the scalarized operands of the predicated instruction
3903 // into the block we created for it. When an instruction is sunk, its
3904 // operands are then added to the worklist. The algorithm ends when a pass
3905 // through the worklist doesn't sink a single instruction.
3906 bool Changed;
3907 do {
3908 // Add the instructions that need to be reanalyzed to the worklist, and
3909 // reset the changed indicator.
3910 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3911 InstsToReanalyze.clear();
3912 Changed = false;
3913
3914 while (!Worklist.empty()) {
3915 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3916
3917 // We can't sink an instruction if it is a phi node, is already in the
3918 // predicated block, is not in the loop, or may have side effects.
3919 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3920 !VectorLoop->contains(I) || I->mayHaveSideEffects())
3921 continue;
3922
3923 // It's legal to sink the instruction if all its uses occur in the
3924 // predicated block. Otherwise, there's nothing to do yet, and we may
3925 // need to reanalyze the instruction.
3926 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3927 InstsToReanalyze.push_back(I);
3928 continue;
3929 }
3930
3931 // Move the instruction to the beginning of the predicated block, and add
3932 // its operands to the worklist.
3933 I->moveBefore(&*PredBB->getFirstInsertionPt());
3934 Worklist.insert(I->op_begin(), I->op_end());
3935
3936 // The sinking may have enabled other instructions to be sunk, so we will
3937 // need to iterate.
3938 Changed = true; 3939 } 3940 } while (Changed); 3941 } 3942 3943 void InnerLoopVectorizer::fixNonInductionPHIs() { 3944 for (PHINode *OrigPhi : OrigPHIsToFix) { 3945 PHINode *NewPhi = 3946 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 3947 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 3948 3949 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 3950 predecessors(OrigPhi->getParent())); 3951 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 3952 predecessors(NewPhi->getParent())); 3953 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 3954 "Scalar and Vector BB should have the same number of predecessors"); 3955 3956 // The insertion point in Builder may be invalidated by the time we get 3957 // here. Force the Builder insertion point to something valid so that we do 3958 // not run into issues during insertion point restore in 3959 // getOrCreateVectorValue calls below. 3960 Builder.SetInsertPoint(NewPhi); 3961 3962 // The predecessor order is preserved and we can rely on mapping between 3963 // scalar and vector block predecessors. 3964 for (unsigned i = 0; i < NumIncomingValues; ++i) { 3965 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 3966 3967 // When looking up the new scalar/vector values to fix up, use incoming 3968 // values from original phi. 3969 Value *ScIncV = 3970 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 3971 3972 // Scalar incoming value may need a broadcast 3973 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 3974 NewPhi->addIncoming(NewIncV, NewPredBB); 3975 } 3976 } 3977 } 3978 3979 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 3980 unsigned VF, bool IsPtrLoopInvariant, 3981 SmallBitVector &IsIndexLoopInvariant) { 3982 // Construct a vector GEP by widening the operands of the scalar GEP as 3983 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 3984 // results in a vector of pointers when at least one operand of the GEP 3985 // is vector-typed. Thus, to keep the representation compact, we only use 3986 // vector-typed operands for loop-varying values. 3987 3988 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 3989 // If we are vectorizing, but the GEP has only loop-invariant operands, 3990 // the GEP we build (by only using vector-typed operands for 3991 // loop-varying values) would be a scalar pointer. Thus, to ensure we 3992 // produce a vector of pointers, we need to either arbitrarily pick an 3993 // operand to broadcast, or broadcast a clone of the original GEP. 3994 // Here, we broadcast a clone of the original. 3995 // 3996 // TODO: If at some point we decide to scalarize instructions having 3997 // loop-invariant operands, this special case will no longer be 3998 // required. We would add the scalarization decision to 3999 // collectLoopScalars() and teach getVectorValue() to broadcast 4000 // the lane-zero scalar value. 4001 auto *Clone = Builder.Insert(GEP->clone()); 4002 for (unsigned Part = 0; Part < UF; ++Part) { 4003 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4004 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4005 addMetadata(EntryPart, GEP); 4006 } 4007 } else { 4008 // If the GEP has at least one loop-varying operand, we are sure to 4009 // produce a vector of pointers. But if we are only unrolling, we want 4010 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4011 // produce with the code below will be scalar (if VF == 1) or vector 4012 // (otherwise). 
Note that for the unroll-only case, we still maintain 4013 // values in the vector mapping with initVector, as we do for other 4014 // instructions. 4015 for (unsigned Part = 0; Part < UF; ++Part) { 4016 // The pointer operand of the new GEP. If it's loop-invariant, we 4017 // won't broadcast it. 4018 auto *Ptr = IsPtrLoopInvariant 4019 ? GEP->getPointerOperand() 4020 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4021 4022 // Collect all the indices for the new GEP. If any index is 4023 // loop-invariant, we won't broadcast it. 4024 SmallVector<Value *, 4> Indices; 4025 for (auto Index : enumerate(GEP->indices())) { 4026 Value *User = Index.value().get(); 4027 if (IsIndexLoopInvariant[Index.index()]) 4028 Indices.push_back(User); 4029 else 4030 Indices.push_back(getOrCreateVectorValue(User, Part)); 4031 } 4032 4033 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4034 // but it should be a vector, otherwise. 4035 auto *NewGEP = 4036 GEP->isInBounds() 4037 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4038 Indices) 4039 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4040 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4041 "NewGEP is not a pointer vector"); 4042 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4043 addMetadata(NewGEP, GEP); 4044 } 4045 } 4046 } 4047 4048 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4049 unsigned VF) { 4050 PHINode *P = cast<PHINode>(PN); 4051 if (EnableVPlanNativePath) { 4052 // Currently we enter here in the VPlan-native path for non-induction 4053 // PHIs where all control flow is uniform. We simply widen these PHIs. 4054 // Create a vector phi with no operands - the vector phi operands will be 4055 // set at the end of vector code generation. 4056 Type *VecTy = 4057 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4058 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4059 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4060 OrigPHIsToFix.push_back(P); 4061 4062 return; 4063 } 4064 4065 assert(PN->getParent() == OrigLoop->getHeader() && 4066 "Non-header phis should have been handled elsewhere"); 4067 4068 // In order to support recurrences we need to be able to vectorize Phi nodes. 4069 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4070 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4071 // this value when we vectorize all of the instructions that use the PHI. 4072 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4073 for (unsigned Part = 0; Part < UF; ++Part) { 4074 // This is phase one of vectorizing PHIs. 4075 Type *VecTy = 4076 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4077 Value *EntryPart = PHINode::Create( 4078 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4079 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4080 } 4081 return; 4082 } 4083 4084 setDebugLocFromInst(Builder, P); 4085 4086 // This PHINode must be an induction variable. 4087 // Make sure that we know about it. 4088 assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); 4089 4090 InductionDescriptor II = Legal->getInductionVars()->lookup(P); 4091 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4092 4093 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4094 // which can be found from the original scalar operations. 
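// For example, in the IK_PtrInduction case below, with VF = 4, UF = 2 and a
// non-uniform pointer IV, part 0 produces scalar "next.gep" values for
// indices 0..3 and part 1 for indices 4..7 (Lane + Part * VF).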
4095 switch (II.getKind()) { 4096 case InductionDescriptor::IK_NoInduction: 4097 llvm_unreachable("Unknown induction"); 4098 case InductionDescriptor::IK_IntInduction: 4099 case InductionDescriptor::IK_FpInduction: 4100 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4101 case InductionDescriptor::IK_PtrInduction: { 4102 // Handle the pointer induction variable case. 4103 assert(P->getType()->isPointerTy() && "Unexpected type."); 4104 // This is the normalized GEP that starts counting at zero. 4105 Value *PtrInd = Induction; 4106 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4107 // Determine the number of scalars we need to generate for each unroll 4108 // iteration. If the instruction is uniform, we only need to generate the 4109 // first lane. Otherwise, we generate all VF values. 4110 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4111 // These are the scalar results. Notice that we don't generate vector GEPs 4112 // because scalar GEPs result in better code. 4113 for (unsigned Part = 0; Part < UF; ++Part) { 4114 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4115 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4116 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4117 Value *SclrGep = 4118 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4119 SclrGep->setName("next.gep"); 4120 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4121 } 4122 } 4123 return; 4124 } 4125 } 4126 } 4127 4128 /// A helper function for checking whether an integer division-related 4129 /// instruction may divide by zero (in which case it must be predicated if 4130 /// executed conditionally in the scalar code). 4131 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4132 /// Non-zero divisors that are non compile-time constants will not be 4133 /// converted into multiplication, so we will still end up scalarizing 4134 /// the division, but can do so w/o predication. 4135 static bool mayDivideByZero(Instruction &I) { 4136 assert((I.getOpcode() == Instruction::UDiv || 4137 I.getOpcode() == Instruction::SDiv || 4138 I.getOpcode() == Instruction::URem || 4139 I.getOpcode() == Instruction::SRem) && 4140 "Unexpected instruction"); 4141 Value *Divisor = I.getOperand(1); 4142 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4143 return !CInt || CInt->isZero(); 4144 } 4145 4146 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4147 switch (I.getOpcode()) { 4148 case Instruction::Br: 4149 case Instruction::PHI: 4150 case Instruction::GetElementPtr: 4151 llvm_unreachable("This instruction is handled by a different recipe."); 4152 case Instruction::UDiv: 4153 case Instruction::SDiv: 4154 case Instruction::SRem: 4155 case Instruction::URem: 4156 case Instruction::Add: 4157 case Instruction::FAdd: 4158 case Instruction::Sub: 4159 case Instruction::FSub: 4160 case Instruction::FNeg: 4161 case Instruction::Mul: 4162 case Instruction::FMul: 4163 case Instruction::FDiv: 4164 case Instruction::FRem: 4165 case Instruction::Shl: 4166 case Instruction::LShr: 4167 case Instruction::AShr: 4168 case Instruction::And: 4169 case Instruction::Or: 4170 case Instruction::Xor: { 4171 // Just widen unops and binops. 
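    // For example (illustrative): with VF = 4 and UF = 2, a scalar 'add i32'
    // is emitted below as two <4 x i32> adds, one per unroll part, and
    // copyIRFlags() carries the original wrapping/fast-math flags onto each
    // widened operation.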
4172 setDebugLocFromInst(Builder, &I); 4173 4174 for (unsigned Part = 0; Part < UF; ++Part) { 4175 SmallVector<Value *, 2> Ops; 4176 for (Value *Op : I.operands()) 4177 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4178 4179 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4180 4181 if (auto *VecOp = dyn_cast<Instruction>(V)) 4182 VecOp->copyIRFlags(&I); 4183 4184 // Use this vector value for all users of the original instruction. 4185 VectorLoopValueMap.setVectorValue(&I, Part, V); 4186 addMetadata(V, &I); 4187 } 4188 4189 break; 4190 } 4191 case Instruction::Select: { 4192 // Widen selects. 4193 // If the selector is loop invariant we can create a select 4194 // instruction with a scalar condition. Otherwise, use vector-select. 4195 auto *SE = PSE.getSE(); 4196 bool InvariantCond = 4197 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4198 setDebugLocFromInst(Builder, &I); 4199 4200 // The condition can be loop invariant but still defined inside the 4201 // loop. This means that we can't just use the original 'cond' value. 4202 // We have to take the 'vectorized' value and pick the first lane. 4203 // Instcombine will make this a no-op. 4204 4205 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4206 4207 for (unsigned Part = 0; Part < UF; ++Part) { 4208 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4209 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4210 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4211 Value *Sel = 4212 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4213 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4214 addMetadata(Sel, &I); 4215 } 4216 4217 break; 4218 } 4219 4220 case Instruction::ICmp: 4221 case Instruction::FCmp: { 4222 // Widen compares. Generate vector compares. 4223 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4224 auto *Cmp = cast<CmpInst>(&I); 4225 setDebugLocFromInst(Builder, Cmp); 4226 for (unsigned Part = 0; Part < UF; ++Part) { 4227 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4228 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4229 Value *C = nullptr; 4230 if (FCmp) { 4231 // Propagate fast math flags. 4232 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4233 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4234 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4235 } else { 4236 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4237 } 4238 VectorLoopValueMap.setVectorValue(&I, Part, C); 4239 addMetadata(C, &I); 4240 } 4241 4242 break; 4243 } 4244 4245 case Instruction::ZExt: 4246 case Instruction::SExt: 4247 case Instruction::FPToUI: 4248 case Instruction::FPToSI: 4249 case Instruction::FPExt: 4250 case Instruction::PtrToInt: 4251 case Instruction::IntToPtr: 4252 case Instruction::SIToFP: 4253 case Instruction::UIToFP: 4254 case Instruction::Trunc: 4255 case Instruction::FPTrunc: 4256 case Instruction::BitCast: { 4257 auto *CI = cast<CastInst>(&I); 4258 setDebugLocFromInst(Builder, CI); 4259 4260 /// Vectorize casts. 4261 Type *DestTy = 4262 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4263 4264 for (unsigned Part = 0; Part < UF; ++Part) { 4265 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4266 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4267 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4268 addMetadata(Cast, &I); 4269 } 4270 break; 4271 } 4272 4273 case Instruction::Call: { 4274 // Ignore dbg intrinsics. 
4275 if (isa<DbgInfoIntrinsic>(I)) 4276 break; 4277 setDebugLocFromInst(Builder, &I); 4278 4279 Module *M = I.getParent()->getParent()->getParent(); 4280 auto *CI = cast<CallInst>(&I); 4281 4282 StringRef FnName = CI->getCalledFunction()->getName(); 4283 Function *F = CI->getCalledFunction(); 4284 Type *RetTy = ToVectorTy(CI->getType(), VF); 4285 SmallVector<Type *, 4> Tys; 4286 for (Value *ArgOperand : CI->arg_operands()) 4287 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4288 4289 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4290 4291 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4292 // version of the instruction. 4293 // Is it beneficial to perform intrinsic call compared to lib call? 4294 bool NeedToScalarize; 4295 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4296 bool UseVectorIntrinsic = 4297 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4298 assert((UseVectorIntrinsic || !NeedToScalarize) && 4299 "Instruction should be scalarized elsewhere."); 4300 4301 for (unsigned Part = 0; Part < UF; ++Part) { 4302 SmallVector<Value *, 4> Args; 4303 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4304 Value *Arg = CI->getArgOperand(i); 4305 // Some intrinsics have a scalar argument - don't replace it with a 4306 // vector. 4307 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4308 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4309 Args.push_back(Arg); 4310 } 4311 4312 Function *VectorF; 4313 if (UseVectorIntrinsic) { 4314 // Use vector version of the intrinsic. 4315 Type *TysForDecl[] = {CI->getType()}; 4316 if (VF > 1) 4317 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4318 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4319 } else { 4320 // Use vector version of the library call. 4321 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); 4322 assert(!VFnName.empty() && "Vector function name is empty."); 4323 VectorF = M->getFunction(VFnName); 4324 if (!VectorF) { 4325 // Generate a declaration 4326 FunctionType *FTy = FunctionType::get(RetTy, Tys, false); 4327 VectorF = 4328 Function::Create(FTy, Function::ExternalLinkage, VFnName, M); 4329 VectorF->copyAttributesFrom(F); 4330 } 4331 } 4332 assert(VectorF && "Can't create vector function."); 4333 4334 SmallVector<OperandBundleDef, 1> OpBundles; 4335 CI->getOperandBundlesAsDefs(OpBundles); 4336 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4337 4338 if (isa<FPMathOperator>(V)) 4339 V->copyFastMathFlags(CI); 4340 4341 VectorLoopValueMap.setVectorValue(&I, Part, V); 4342 addMetadata(V, &I); 4343 } 4344 4345 break; 4346 } 4347 4348 default: 4349 // This instruction is not vectorized by simple widening. 4350 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4351 llvm_unreachable("Unhandled instruction!"); 4352 } // end of switch. 4353 } 4354 4355 void InnerLoopVectorizer::updateAnalysis() { 4356 // Forget the original basic block. 4357 PSE.getSE()->forgetLoop(OrigLoop); 4358 4359 // DT is not kept up-to-date for outer loop vectorization 4360 if (EnableVPlanNativePath) 4361 return; 4362 4363 // Update the dominator tree information. 
4364 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && 4365 "Entry does not dominate exit."); 4366 4367 DT->addNewBlock(LoopMiddleBlock, 4368 LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 4369 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); 4370 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); 4371 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); 4372 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 4373 } 4374 4375 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4376 // We should not collect Scalars more than once per VF. Right now, this 4377 // function is called from collectUniformsAndScalars(), which already does 4378 // this check. Collecting Scalars for VF=1 does not make any sense. 4379 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4380 "This function should not be visited twice for the same VF"); 4381 4382 SmallSetVector<Instruction *, 8> Worklist; 4383 4384 // These sets are used to seed the analysis with pointers used by memory 4385 // accesses that will remain scalar. 4386 SmallSetVector<Instruction *, 8> ScalarPtrs; 4387 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4388 4389 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4390 // The pointer operands of loads and stores will be scalar as long as the 4391 // memory access is not a gather or scatter operation. The value operand of a 4392 // store will remain scalar if the store is scalarized. 4393 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4394 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4395 assert(WideningDecision != CM_Unknown && 4396 "Widening decision should be ready at this moment"); 4397 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4398 if (Ptr == Store->getValueOperand()) 4399 return WideningDecision == CM_Scalarize; 4400 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4401 "Ptr is neither a value or pointer operand"); 4402 return WideningDecision != CM_GatherScatter; 4403 }; 4404 4405 // A helper that returns true if the given value is a bitcast or 4406 // getelementptr instruction contained in the loop. 4407 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4408 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4409 isa<GetElementPtrInst>(V)) && 4410 !TheLoop->isLoopInvariant(V); 4411 }; 4412 4413 // A helper that evaluates a memory access's use of a pointer. If the use 4414 // will be a scalar use, and the pointer is only used by memory accesses, we 4415 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4416 // PossibleNonScalarPtrs. 4417 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4418 // We only care about bitcast and getelementptr instructions contained in 4419 // the loop. 4420 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4421 return; 4422 4423 // If the pointer has already been identified as scalar (e.g., if it was 4424 // also identified as uniform), there's nothing to do. 4425 auto *I = cast<Instruction>(Ptr); 4426 if (Worklist.count(I)) 4427 return; 4428 4429 // If the use of the pointer will be a scalar use, and all users of the 4430 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4431 // place the pointer in PossibleNonScalarPtrs. 
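    // For example (illustrative): a loop-varying GEP used only as the address
    // of consecutive (widened or scalarized) loads and stores is a scalar use
    // and ends up in ScalarPtrs, whereas a GEP that feeds a gather/scatter, or
    // that also has a non-memory user, is placed in PossibleNonScalarPtrs.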
4432 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4433 return isa<LoadInst>(U) || isa<StoreInst>(U); 4434 })) 4435 ScalarPtrs.insert(I); 4436 else 4437 PossibleNonScalarPtrs.insert(I); 4438 }; 4439 4440 // We seed the scalars analysis with three classes of instructions: (1) 4441 // instructions marked uniform-after-vectorization, (2) bitcast and 4442 // getelementptr instructions used by memory accesses requiring a scalar use, 4443 // and (3) pointer induction variables and their update instructions (we 4444 // currently only scalarize these). 4445 // 4446 // (1) Add to the worklist all instructions that have been identified as 4447 // uniform-after-vectorization. 4448 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4449 4450 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4451 // memory accesses requiring a scalar use. The pointer operands of loads and 4452 // stores will be scalar as long as the memory accesses is not a gather or 4453 // scatter operation. The value operand of a store will remain scalar if the 4454 // store is scalarized. 4455 for (auto *BB : TheLoop->blocks()) 4456 for (auto &I : *BB) { 4457 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4458 evaluatePtrUse(Load, Load->getPointerOperand()); 4459 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4460 evaluatePtrUse(Store, Store->getPointerOperand()); 4461 evaluatePtrUse(Store, Store->getValueOperand()); 4462 } 4463 } 4464 for (auto *I : ScalarPtrs) 4465 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4466 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4467 Worklist.insert(I); 4468 } 4469 4470 // (3) Add to the worklist all pointer induction variables and their update 4471 // instructions. 4472 // 4473 // TODO: Once we are able to vectorize pointer induction variables we should 4474 // no longer insert them into the worklist here. 4475 auto *Latch = TheLoop->getLoopLatch(); 4476 for (auto &Induction : *Legal->getInductionVars()) { 4477 auto *Ind = Induction.first; 4478 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4479 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4480 continue; 4481 Worklist.insert(Ind); 4482 Worklist.insert(IndUpdate); 4483 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4484 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4485 << "\n"); 4486 } 4487 4488 // Insert the forced scalars. 4489 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4490 // induction variable when the PHI user is scalarized. 4491 auto ForcedScalar = ForcedScalars.find(VF); 4492 if (ForcedScalar != ForcedScalars.end()) 4493 for (auto *I : ForcedScalar->second) 4494 Worklist.insert(I); 4495 4496 // Expand the worklist by looking through any bitcasts and getelementptr 4497 // instructions we've already identified as scalar. This is similar to the 4498 // expansion step in collectLoopUniforms(); however, here we're only 4499 // expanding to include additional bitcasts and getelementptr instructions. 
4500 unsigned Idx = 0; 4501 while (Idx != Worklist.size()) { 4502 Instruction *Dst = Worklist[Idx++]; 4503 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4504 continue; 4505 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4506 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4507 auto *J = cast<Instruction>(U); 4508 return !TheLoop->contains(J) || Worklist.count(J) || 4509 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4510 isScalarUse(J, Src)); 4511 })) { 4512 Worklist.insert(Src); 4513 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4514 } 4515 } 4516 4517 // An induction variable will remain scalar if all users of the induction 4518 // variable and induction variable update remain scalar. 4519 for (auto &Induction : *Legal->getInductionVars()) { 4520 auto *Ind = Induction.first; 4521 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4522 4523 // We already considered pointer induction variables, so there's no reason 4524 // to look at their users again. 4525 // 4526 // TODO: Once we are able to vectorize pointer induction variables we 4527 // should no longer skip over them here. 4528 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4529 continue; 4530 4531 // Determine if all users of the induction variable are scalar after 4532 // vectorization. 4533 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4534 auto *I = cast<Instruction>(U); 4535 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4536 }); 4537 if (!ScalarInd) 4538 continue; 4539 4540 // Determine if all users of the induction variable update instruction are 4541 // scalar after vectorization. 4542 auto ScalarIndUpdate = 4543 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4544 auto *I = cast<Instruction>(U); 4545 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4546 }); 4547 if (!ScalarIndUpdate) 4548 continue; 4549 4550 // The induction variable and its update instruction will remain scalar. 4551 Worklist.insert(Ind); 4552 Worklist.insert(IndUpdate); 4553 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4554 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4555 << "\n"); 4556 } 4557 4558 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4559 } 4560 4561 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4562 if (!blockNeedsPredication(I->getParent())) 4563 return false; 4564 switch(I->getOpcode()) { 4565 default: 4566 break; 4567 case Instruction::Load: 4568 case Instruction::Store: { 4569 if (!Legal->isMaskRequired(I)) 4570 return false; 4571 auto *Ptr = getLoadStorePointerOperand(I); 4572 auto *Ty = getMemInstValueType(I); 4573 // We have already decided how to vectorize this instruction, get that 4574 // result. 4575 if (VF > 1) { 4576 InstWidening WideningDecision = getWideningDecision(I, VF); 4577 assert(WideningDecision != CM_Unknown && 4578 "Widening decision should be ready at this moment"); 4579 return WideningDecision == CM_Scalarize; 4580 } 4581 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4582 return isa<LoadInst>(I) ? 
4583 !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty)) 4584 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty)); 4585 } 4586 case Instruction::UDiv: 4587 case Instruction::SDiv: 4588 case Instruction::SRem: 4589 case Instruction::URem: 4590 return mayDivideByZero(*I); 4591 } 4592 return false; 4593 } 4594 4595 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4596 unsigned VF) { 4597 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4598 assert(getWideningDecision(I, VF) == CM_Unknown && 4599 "Decision should not be set yet."); 4600 auto *Group = getInterleavedAccessGroup(I); 4601 assert(Group && "Must have a group."); 4602 4603 // If the instruction's allocated size doesn't equal its type size, it 4604 // requires padding and will be scalarized. 4605 auto &DL = I->getModule()->getDataLayout(); 4606 auto *ScalarTy = getMemInstValueType(I); 4607 if (hasIrregularType(ScalarTy, DL, VF)) 4608 return false; 4609 4610 // Check if masking is required. 4611 // A Group may need masking for one of two reasons: it resides in a block that 4612 // needs predication, or it was decided to use masking to deal with gaps. 4613 bool PredicatedAccessRequiresMasking = 4614 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4615 bool AccessWithGapsRequiresMasking = 4616 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4617 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4618 return true; 4619 4620 // If masked interleaving is required, we expect that the user/target had 4621 // enabled it, because otherwise it either wouldn't have been created or 4622 // it should have been invalidated by the CostModel. 4623 assert(useMaskedInterleavedAccesses(TTI) && 4624 "Masked interleave-groups for predicated accesses are not enabled."); 4625 4626 auto *Ty = getMemInstValueType(I); 4627 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4628 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4629 : TTI.isLegalMaskedStore(Ty, Alignment); 4630 } 4631 4632 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4633 unsigned VF) { 4634 // Get and ensure we have a valid memory instruction. 4635 LoadInst *LI = dyn_cast<LoadInst>(I); 4636 StoreInst *SI = dyn_cast<StoreInst>(I); 4637 assert((LI || SI) && "Invalid memory instruction"); 4638 4639 auto *Ptr = getLoadStorePointerOperand(I); 4640 4641 // In order to be widened, the pointer should be consecutive, first of all. 4642 if (!Legal->isConsecutivePtr(Ptr)) 4643 return false; 4644 4645 // If the instruction is a store located in a predicated block, it will be 4646 // scalarized. 4647 if (isScalarWithPredication(I)) 4648 return false; 4649 4650 // If the instruction's allocated size doesn't equal its type size, it 4651 // requires padding and will be scalarized. 4652 auto &DL = I->getModule()->getDataLayout(); 4653 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4654 if (hasIrregularType(ScalarTy, DL, VF)) 4655 return false; 4656 4657 return true; 4658 } 4659 4660 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4661 // We should not collect Uniforms more than once per VF. Right now, 4662 // this function is called from collectUniformsAndScalars(), which 4663 // already does this check. Collecting Uniforms for VF=1 does not make any 4664 // sense.
4665 4666 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4667 "This function should not be visited twice for the same VF"); 4668 4669 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4670 // not analyze again. Uniforms.count(VF) will return 1. 4671 Uniforms[VF].clear(); 4672 4673 // We now know that the loop is vectorizable! 4674 // Collect instructions inside the loop that will remain uniform after 4675 // vectorization. 4676 4677 // Global values, params and instructions outside of current loop are out of 4678 // scope. 4679 auto isOutOfScope = [&](Value *V) -> bool { 4680 Instruction *I = dyn_cast<Instruction>(V); 4681 return (!I || !TheLoop->contains(I)); 4682 }; 4683 4684 SetVector<Instruction *> Worklist; 4685 BasicBlock *Latch = TheLoop->getLoopLatch(); 4686 4687 // Instructions that are scalar with predication must not be considered 4688 // uniform after vectorization, because that would create an erroneous 4689 // replicating region where only a single instance out of VF should be formed. 4690 // TODO: optimize such seldom cases if found important, see PR40816. 4691 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4692 if (isScalarWithPredication(I, VF)) { 4693 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4694 << *I << "\n"); 4695 return; 4696 } 4697 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4698 Worklist.insert(I); 4699 }; 4700 4701 // Start with the conditional branch. If the branch condition is an 4702 // instruction contained in the loop that is only used by the branch, it is 4703 // uniform. 4704 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4705 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4706 addToWorklistIfAllowed(Cmp); 4707 4708 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4709 // are pointers that are treated like consecutive pointers during 4710 // vectorization. The pointer operands of interleaved accesses are an 4711 // example. 4712 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4713 4714 // Holds pointer operands of instructions that are possibly non-uniform. 4715 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4716 4717 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4718 InstWidening WideningDecision = getWideningDecision(I, VF); 4719 assert(WideningDecision != CM_Unknown && 4720 "Widening decision should be ready at this moment"); 4721 4722 return (WideningDecision == CM_Widen || 4723 WideningDecision == CM_Widen_Reverse || 4724 WideningDecision == CM_Interleave); 4725 }; 4726 // Iterate over the instructions in the loop, and collect all 4727 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4728 // that a consecutive-like pointer operand will be scalarized, we collect it 4729 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4730 // getelementptr instruction can be used by both vectorized and scalarized 4731 // memory instructions. For example, if a loop loads and stores from the same 4732 // location, but the store is conditional, the store will be scalarized, and 4733 // the getelementptr won't remain uniform. 4734 for (auto *BB : TheLoop->blocks()) 4735 for (auto &I : *BB) { 4736 // If there's no pointer operand, there's nothing to do. 
4737 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4738 if (!Ptr) 4739 continue; 4740 4741 // True if all users of Ptr are memory accesses that have Ptr as their 4742 // pointer operand. 4743 auto UsersAreMemAccesses = 4744 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4745 return getLoadStorePointerOperand(U) == Ptr; 4746 }); 4747 4748 // Ensure the memory instruction will not be scalarized or used by 4749 // gather/scatter, making its pointer operand non-uniform. If the pointer 4750 // operand is used by any instruction other than a memory access, we 4751 // conservatively assume the pointer operand may be non-uniform. 4752 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4753 PossibleNonUniformPtrs.insert(Ptr); 4754 4755 // If the memory instruction will be vectorized and its pointer operand 4756 // is consecutive-like, or interleaving - the pointer operand should 4757 // remain uniform. 4758 else 4759 ConsecutiveLikePtrs.insert(Ptr); 4760 } 4761 4762 // Add to the Worklist all consecutive and consecutive-like pointers that 4763 // aren't also identified as possibly non-uniform. 4764 for (auto *V : ConsecutiveLikePtrs) 4765 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4766 addToWorklistIfAllowed(V); 4767 4768 // Expand Worklist in topological order: whenever a new instruction 4769 // is added , its users should be already inside Worklist. It ensures 4770 // a uniform instruction will only be used by uniform instructions. 4771 unsigned idx = 0; 4772 while (idx != Worklist.size()) { 4773 Instruction *I = Worklist[idx++]; 4774 4775 for (auto OV : I->operand_values()) { 4776 // isOutOfScope operands cannot be uniform instructions. 4777 if (isOutOfScope(OV)) 4778 continue; 4779 // First order recurrence Phi's should typically be considered 4780 // non-uniform. 4781 auto *OP = dyn_cast<PHINode>(OV); 4782 if (OP && Legal->isFirstOrderRecurrence(OP)) 4783 continue; 4784 // If all the users of the operand are uniform, then add the 4785 // operand into the uniform worklist. 4786 auto *OI = cast<Instruction>(OV); 4787 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4788 auto *J = cast<Instruction>(U); 4789 return Worklist.count(J) || 4790 (OI == getLoadStorePointerOperand(J) && 4791 isUniformDecision(J, VF)); 4792 })) 4793 addToWorklistIfAllowed(OI); 4794 } 4795 } 4796 4797 // Returns true if Ptr is the pointer operand of a memory access instruction 4798 // I, and I is known to not require scalarization. 4799 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4800 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4801 }; 4802 4803 // For an instruction to be added into Worklist above, all its users inside 4804 // the loop should also be in Worklist. However, this condition cannot be 4805 // true for phi nodes that form a cyclic dependence. We must process phi 4806 // nodes separately. An induction variable will remain uniform if all users 4807 // of the induction variable and induction variable update remain uniform. 4808 // The code below handles both pointer and non-pointer induction variables. 4809 for (auto &Induction : *Legal->getInductionVars()) { 4810 auto *Ind = Induction.first; 4811 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4812 4813 // Determine if all users of the induction variable are uniform after 4814 // vectorization. 
4815 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4816 auto *I = cast<Instruction>(U); 4817 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4818 isVectorizedMemAccessUse(I, Ind); 4819 }); 4820 if (!UniformInd) 4821 continue; 4822 4823 // Determine if all users of the induction variable update instruction are 4824 // uniform after vectorization. 4825 auto UniformIndUpdate = 4826 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4827 auto *I = cast<Instruction>(U); 4828 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4829 isVectorizedMemAccessUse(I, IndUpdate); 4830 }); 4831 if (!UniformIndUpdate) 4832 continue; 4833 4834 // The induction variable and its update instruction will remain uniform. 4835 addToWorklistIfAllowed(Ind); 4836 addToWorklistIfAllowed(IndUpdate); 4837 } 4838 4839 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4840 } 4841 4842 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4843 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4844 4845 if (Legal->getRuntimePointerChecking()->Need) { 4846 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4847 "runtime pointer checks needed. Enable vectorization of this " 4848 "loop with '#pragma clang loop vectorize(enable)' when " 4849 "compiling with -Os/-Oz", 4850 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4851 return true; 4852 } 4853 4854 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4855 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4856 "runtime SCEV checks needed. Enable vectorization of this " 4857 "loop with '#pragma clang loop vectorize(enable)' when " 4858 "compiling with -Os/-Oz", 4859 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4860 return true; 4861 } 4862 4863 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4864 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4865 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4866 "runtime stride == 1 checks needed. Enable vectorization of " 4867 "this loop with '#pragma clang loop vectorize(enable)' when " 4868 "compiling with -Os/-Oz", 4869 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4870 return true; 4871 } 4872 4873 return false; 4874 } 4875 4876 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4877 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4878 // TODO: It may be useful to do this, since it's still likely to be dynamically 4879 // uniform if the target can skip. 4880 reportVectorizationFailure( 4881 "Not inserting runtime ptr check for divergent target", 4882 "runtime pointer checks needed. 
Not enabled for divergent target", 4883 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4884 return None; 4885 } 4886 4887 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4888 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4889 if (TC == 1) { 4890 reportVectorizationFailure("Single iteration (non) loop", 4891 "loop trip count is one, irrelevant for vectorization", 4892 "SingleIterationLoop", ORE, TheLoop); 4893 return None; 4894 } 4895 4896 switch (ScalarEpilogueStatus) { 4897 case CM_ScalarEpilogueAllowed: 4898 return computeFeasibleMaxVF(TC); 4899 case CM_ScalarEpilogueNotNeededUsePredicate: 4900 LLVM_DEBUG( 4901 dbgs() << "LV: vector predicate hint/switch found.\n" 4902 << "LV: Not allowing scalar epilogue, creating predicated " 4903 << "vector loop.\n"); 4904 break; 4905 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4906 // fallthrough as a special case of OptForSize 4907 case CM_ScalarEpilogueNotAllowedOptSize: 4908 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4909 LLVM_DEBUG( 4910 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4911 else 4912 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4913 << "count.\n"); 4914 4915 // Bail if runtime checks are required, which are not good when optimising 4916 // for size. 4917 if (runtimeChecksRequired()) 4918 return None; 4919 break; 4920 } 4921 4922 // Now try the tail folding 4923 4924 // Invalidate interleave groups that require an epilogue if we can't mask 4925 // the interleave-group. 4926 if (!useMaskedInterleavedAccesses(TTI)) 4927 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4928 4929 unsigned MaxVF = computeFeasibleMaxVF(TC); 4930 if (TC > 0 && TC % MaxVF == 0) { 4931 // Accept MaxVF if we do not have a tail. 4932 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4933 return MaxVF; 4934 } 4935 4936 // If we don't know the precise trip count, or if the trip count that we 4937 // found modulo the vectorization factor is not zero, try to fold the tail 4938 // by masking. 4939 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4940 if (Legal->prepareToFoldTailByMasking()) { 4941 FoldTailByMasking = true; 4942 return MaxVF; 4943 } 4944 4945 if (TC == 0) { 4946 reportVectorizationFailure( 4947 "Unable to calculate the loop count due to complex control flow", 4948 "unable to calculate the loop count due to complex control flow", 4949 "UnknownLoopCountComplexCFG", ORE, TheLoop); 4950 return None; 4951 } 4952 4953 reportVectorizationFailure( 4954 "Cannot optimize for size and vectorize at the same time.", 4955 "cannot optimize for size and vectorize at the same time. " 4956 "Enable vectorization of this loop with '#pragma clang loop " 4957 "vectorize(enable)' when compiling with -Os/-Oz", 4958 "NoTailLoopWithOptForSize", ORE, TheLoop); 4959 return None; 4960 } 4961 4962 unsigned 4963 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 4964 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 4965 unsigned SmallestType, WidestType; 4966 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 4967 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 4968 4969 // Get the maximum safe dependence distance in bits computed by LAA. 4970 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 4971 // the memory accesses that is most restrictive (involved in the smallest 4972 // dependence distance). 
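  // For example (illustrative): if the most restrictive dependence allows at
  // most 4 iterations of an i32 access to execute in parallel, the maximum
  // safe register width is 4 * 32 = 128 bits, so a 256-bit target register is
  // clamped to 128 bits below.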
4973 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 4974 4975 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 4976 4977 unsigned MaxVectorSize = WidestRegister / WidestType; 4978 4979 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 4980 << " / " << WidestType << " bits.\n"); 4981 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 4982 << WidestRegister << " bits.\n"); 4983 4984 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 4985 " into one vector!"); 4986 if (MaxVectorSize == 0) { 4987 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 4988 MaxVectorSize = 1; 4989 return MaxVectorSize; 4990 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 4991 isPowerOf2_32(ConstTripCount)) { 4992 // We need to clamp the VF to be the ConstTripCount. There is no point in 4993 // choosing a higher viable VF as done in the loop below. 4994 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 4995 << ConstTripCount << "\n"); 4996 MaxVectorSize = ConstTripCount; 4997 return MaxVectorSize; 4998 } 4999 5000 unsigned MaxVF = MaxVectorSize; 5001 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5002 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5003 // Collect all viable vectorization factors larger than the default MaxVF 5004 // (i.e. MaxVectorSize). 5005 SmallVector<unsigned, 8> VFs; 5006 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5007 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5008 VFs.push_back(VS); 5009 5010 // For each VF calculate its register usage. 5011 auto RUs = calculateRegisterUsage(VFs); 5012 5013 // Select the largest VF which doesn't require more registers than existing 5014 // ones. 5015 for (int i = RUs.size() - 1; i >= 0; --i) { 5016 bool Selected = true; 5017 for (auto& pair : RUs[i].MaxLocalUsers) { 5018 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5019 if (pair.second > TargetNumRegisters) 5020 Selected = false; 5021 } 5022 if (Selected) { 5023 MaxVF = VFs[i]; 5024 break; 5025 } 5026 } 5027 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5028 if (MaxVF < MinVF) { 5029 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5030 << ") with target's minimum: " << MinVF << '\n'); 5031 MaxVF = MinVF; 5032 } 5033 } 5034 } 5035 return MaxVF; 5036 } 5037 5038 VectorizationFactor 5039 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5040 float Cost = expectedCost(1).first; 5041 const float ScalarCost = Cost; 5042 unsigned Width = 1; 5043 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5044 5045 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5046 if (ForceVectorization && MaxVF > 1) { 5047 // Ignore scalar width, because the user explicitly wants vectorization. 5048 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5049 // evaluation. 5050 Cost = std::numeric_limits<float>::max(); 5051 } 5052 5053 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5054 // Notice that the vector loop needs to be executed less times, so 5055 // we need to divide the cost of the vector loops by the width of 5056 // the vector elements. 
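    // For example (illustrative): with a scalar loop cost of 8, a candidate
    // width of 4 whose expected cost is 20 has a per-lane cost of 20 / 4 = 5;
    // that beats the scalar cost, so VF = 4 is kept unless an even wider
    // factor turns out to be cheaper per lane.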
5057 VectorizationCostTy C = expectedCost(i); 5058 float VectorCost = C.first / (float)i; 5059 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5060 << " costs: " << (int)VectorCost << ".\n"); 5061 if (!C.second && !ForceVectorization) { 5062 LLVM_DEBUG( 5063 dbgs() << "LV: Not considering vector loop of width " << i 5064 << " because it will not generate any vector instructions.\n"); 5065 continue; 5066 } 5067 if (VectorCost < Cost) { 5068 Cost = VectorCost; 5069 Width = i; 5070 } 5071 } 5072 5073 if (!EnableCondStoresVectorization && NumPredStores) { 5074 reportVectorizationFailure("There are conditional stores.", 5075 "store that is conditionally executed prevents vectorization", 5076 "ConditionalStore", ORE, TheLoop); 5077 Width = 1; 5078 Cost = ScalarCost; 5079 } 5080 5081 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5082 << "LV: Vectorization seems to be not beneficial, " 5083 << "but was forced by a user.\n"); 5084 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5085 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5086 return Factor; 5087 } 5088 5089 std::pair<unsigned, unsigned> 5090 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5091 unsigned MinWidth = -1U; 5092 unsigned MaxWidth = 8; 5093 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5094 5095 // For each block. 5096 for (BasicBlock *BB : TheLoop->blocks()) { 5097 // For each instruction in the loop. 5098 for (Instruction &I : BB->instructionsWithoutDebug()) { 5099 Type *T = I.getType(); 5100 5101 // Skip ignored values. 5102 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5103 continue; 5104 5105 // Only examine Loads, Stores and PHINodes. 5106 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5107 continue; 5108 5109 // Examine PHI nodes that are reduction variables. Update the type to 5110 // account for the recurrence type. 5111 if (auto *PN = dyn_cast<PHINode>(&I)) { 5112 if (!Legal->isReductionVariable(PN)) 5113 continue; 5114 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; 5115 T = RdxDesc.getRecurrenceType(); 5116 } 5117 5118 // Examine the stored values. 5119 if (auto *ST = dyn_cast<StoreInst>(&I)) 5120 T = ST->getValueOperand()->getType(); 5121 5122 // Ignore loaded pointer types and stored pointer types that are not 5123 // vectorizable. 5124 // 5125 // FIXME: The check here attempts to predict whether a load or store will 5126 // be vectorized. We only know this for certain after a VF has 5127 // been selected. Here, we assume that if an access can be 5128 // vectorized, it will be. We should also look at extending this 5129 // optimization to non-pointer types. 5130 // 5131 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5132 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5133 continue; 5134 5135 MinWidth = std::min(MinWidth, 5136 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5137 MaxWidth = std::max(MaxWidth, 5138 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5139 } 5140 } 5141 5142 return {MinWidth, MaxWidth}; 5143 } 5144 5145 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5146 unsigned LoopCost) { 5147 // -- The interleave heuristics -- 5148 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5149 // There are many micro-architectural considerations that we can't predict 5150 // at this level. 
For example, frontend pressure (on decode or fetch) due to 5151 // code size, or the number and capabilities of the execution ports. 5152 // 5153 // We use the following heuristics to select the interleave count: 5154 // 1. If the code has reductions, then we interleave to break the cross 5155 // iteration dependency. 5156 // 2. If the loop is really small, then we interleave to reduce the loop 5157 // overhead. 5158 // 3. We don't interleave if we think that we will spill registers to memory 5159 // due to the increased register pressure. 5160 5161 if (!isScalarEpilogueAllowed()) 5162 return 1; 5163 5164 // We used the distance for the interleave count. 5165 if (Legal->getMaxSafeDepDistBytes() != -1U) 5166 return 1; 5167 5168 // Do not interleave loops with a relatively small known or estimated trip 5169 // count. 5170 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5171 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5172 return 1; 5173 5174 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5175 // We divide by these constants so assume that we have at least one 5176 // instruction that uses at least one register. 5177 for (auto& pair : R.MaxLocalUsers) { 5178 pair.second = std::max(pair.second, 1U); 5179 } 5180 5181 // We calculate the interleave count using the following formula. 5182 // Subtract the number of loop invariants from the number of available 5183 // registers. These registers are used by all of the interleaved instances. 5184 // Next, divide the remaining registers by the number of registers that is 5185 // required by the loop, in order to estimate how many parallel instances 5186 // fit without causing spills. All of this is rounded down if necessary to be 5187 // a power of two. We want power of two interleave count to simplify any 5188 // addressing operations or alignment considerations. 5189 // We also want power of two interleave counts to ensure that the induction 5190 // variable of the vector loop wraps to zero, when tail is folded by masking; 5191 // this currently happens when OptForSize, in which case IC is set to 1 above. 5192 unsigned IC = UINT_MAX; 5193 5194 for (auto& pair : R.MaxLocalUsers) { 5195 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5196 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5197 << " registers of " 5198 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5199 if (VF == 1) { 5200 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5201 TargetNumRegisters = ForceTargetNumScalarRegs; 5202 } else { 5203 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5204 TargetNumRegisters = ForceTargetNumVectorRegs; 5205 } 5206 unsigned MaxLocalUsers = pair.second; 5207 unsigned LoopInvariantRegs = 0; 5208 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5209 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5210 5211 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5212 // Don't count the induction variable as interleaved. 5213 if (EnableIndVarRegisterHeur) { 5214 TmpIC = 5215 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5216 std::max(1U, (MaxLocalUsers - 1))); 5217 } 5218 5219 IC = std::min(IC, TmpIC); 5220 } 5221 5222 // Clamp the interleave ranges to reasonable counts. 5223 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5224 5225 // Check if the user has overridden the max. 
5226 if (VF == 1) { 5227 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5228 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5229 } else { 5230 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5231 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5232 } 5233 5234 // If trip count is known or estimated compile time constant, limit the 5235 // interleave count to be less than the trip count divided by VF. 5236 if (BestKnownTC) { 5237 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5238 } 5239 5240 // If we did not calculate the cost for VF (because the user selected the VF) 5241 // then we calculate the cost of VF here. 5242 if (LoopCost == 0) 5243 LoopCost = expectedCost(VF).first; 5244 5245 assert(LoopCost && "Non-zero loop cost expected"); 5246 5247 // Clamp the calculated IC to be between the 1 and the max interleave count 5248 // that the target and trip count allows. 5249 if (IC > MaxInterleaveCount) 5250 IC = MaxInterleaveCount; 5251 else if (IC < 1) 5252 IC = 1; 5253 5254 // Interleave if we vectorized this loop and there is a reduction that could 5255 // benefit from interleaving. 5256 if (VF > 1 && !Legal->getReductionVars()->empty()) { 5257 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5258 return IC; 5259 } 5260 5261 // Note that if we've already vectorized the loop we will have done the 5262 // runtime check and so interleaving won't require further checks. 5263 bool InterleavingRequiresRuntimePointerCheck = 5264 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5265 5266 // We want to interleave small loops in order to reduce the loop overhead and 5267 // potentially expose ILP opportunities. 5268 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5269 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5270 // We assume that the cost overhead is 1 and we use the cost model 5271 // to estimate the cost of the loop and interleave until the cost of the 5272 // loop overhead is about 5% of the cost of the loop. 5273 unsigned SmallIC = 5274 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5275 5276 // Interleave until store/load ports (estimated by max interleave count) are 5277 // saturated. 5278 unsigned NumStores = Legal->getNumStores(); 5279 unsigned NumLoads = Legal->getNumLoads(); 5280 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5281 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5282 5283 // If we have a scalar reduction (vector reductions are already dealt with 5284 // by this point), we can increase the critical path length if the loop 5285 // we're interleaving is inside another loop. Limit, by default to 2, so the 5286 // critical path only gets increased by one reduction operation. 
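    // For example (illustrative): an inner loop with a scalar reduction whose
    // computed SmallIC is 8 gets clamped to the default limit of 2 below,
    // keeping the critical-path growth to a single extra reduction operation.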
5287 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) { 5288 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5289 SmallIC = std::min(SmallIC, F); 5290 StoresIC = std::min(StoresIC, F); 5291 LoadsIC = std::min(LoadsIC, F); 5292 } 5293 5294 if (EnableLoadStoreRuntimeInterleave && 5295 std::max(StoresIC, LoadsIC) > SmallIC) { 5296 LLVM_DEBUG( 5297 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5298 return std::max(StoresIC, LoadsIC); 5299 } 5300 5301 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5302 return SmallIC; 5303 } 5304 5305 // Interleave if this is a large loop (small loops are already dealt with by 5306 // this point) that could benefit from interleaving. 5307 bool HasReductions = !Legal->getReductionVars()->empty(); 5308 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5309 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5310 return IC; 5311 } 5312 5313 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5314 return 1; 5315 } 5316 5317 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5318 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5319 // This function calculates the register usage by measuring the highest number 5320 // of values that are alive at a single location. Obviously, this is a very 5321 // rough estimation. We scan the loop in topological order and 5322 // assign a number to each instruction. We use RPO to ensure that defs are 5323 // met before their users. We assume that each instruction that has in-loop 5324 // users starts an interval. We record every time that an in-loop value is 5325 // used, so we have a list of the first and last occurrences of each 5326 // instruction. Next, we transpose this data structure into a multi-map that 5327 // holds the list of intervals that *end* at a specific location. This multi-map 5328 // allows us to perform a linear search. We scan the instructions linearly 5329 // and record each time that a new interval starts, by placing it in a set. 5330 // If we find this value in the multi-map then we remove it from the set. 5331 // The max register usage is the maximum size of the set. 5332 // We also search for instructions that are defined outside the loop, but are 5333 // used inside the loop. We need this number separately from the max-interval 5334 // usage number because when we unroll, loop-invariant values do not take 5335 // more registers. 5336 LoopBlocksDFS DFS(TheLoop); 5337 DFS.perform(LI); 5338 5339 RegisterUsage RU; 5340 5341 // Each 'key' in the map opens a new interval. The values 5342 // of the map are the index of the 'last seen' usage of the 5343 // instruction that is the key. 5344 using IntervalMap = DenseMap<Instruction *, unsigned>; 5345 5346 // Maps instruction to its index. 5347 SmallVector<Instruction *, 64> IdxToInstr; 5348 // Marks the end of each interval. 5349 IntervalMap EndPoint; 5350 // Saves the list of instruction indices that are used in the loop. 5351 SmallPtrSet<Instruction *, 8> Ends; 5352 // Saves the list of values that are used in the loop but are 5353 // defined outside the loop, such as arguments and constants. 5354 SmallPtrSet<Value *, 8> LoopInvariants; 5355 5356 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5357 for (Instruction &I : BB->instructionsWithoutDebug()) { 5358 IdxToInstr.push_back(&I); 5359 5360 // Save the end location of each USE.
5361 for (Value *U : I.operands()) { 5362 auto *Instr = dyn_cast<Instruction>(U); 5363 5364 // Ignore non-instruction values such as arguments, constants, etc. 5365 if (!Instr) 5366 continue; 5367 5368 // If this instruction is outside the loop then record it and continue. 5369 if (!TheLoop->contains(Instr)) { 5370 LoopInvariants.insert(Instr); 5371 continue; 5372 } 5373 5374 // Overwrite previous end points. 5375 EndPoint[Instr] = IdxToInstr.size(); 5376 Ends.insert(Instr); 5377 } 5378 } 5379 } 5380 5381 // Saves the list of intervals that end with the index in 'key'. 5382 using InstrList = SmallVector<Instruction *, 2>; 5383 DenseMap<unsigned, InstrList> TransposeEnds; 5384 5385 // Transpose the EndPoints to a list of values that end at each index. 5386 for (auto &Interval : EndPoint) 5387 TransposeEnds[Interval.second].push_back(Interval.first); 5388 5389 SmallPtrSet<Instruction *, 8> OpenIntervals; 5390 5391 // Get the size of the widest register. 5392 unsigned MaxSafeDepDist = -1U; 5393 if (Legal->getMaxSafeDepDistBytes() != -1U) 5394 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5395 unsigned WidestRegister = 5396 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5397 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5398 5399 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5400 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5401 5402 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5403 5404 // A lambda that gets the register usage for the given type and VF. 5405 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5406 if (Ty->isTokenTy()) 5407 return 0U; 5408 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5409 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5410 }; 5411 5412 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5413 Instruction *I = IdxToInstr[i]; 5414 5415 // Remove all of the instructions that end at this location. 5416 InstrList &List = TransposeEnds[i]; 5417 for (Instruction *ToRemove : List) 5418 OpenIntervals.erase(ToRemove); 5419 5420 // Ignore instructions that are never used within the loop. 5421 if (Ends.find(I) == Ends.end()) 5422 continue; 5423 5424 // Skip ignored values. 5425 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5426 continue; 5427 5428 // For each VF find the maximum usage of registers. 5429 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5430 // Count the number of live intervals. 5431 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5432 5433 if (VFs[j] == 1) { 5434 for (auto Inst : OpenIntervals) { 5435 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5436 if (RegUsage.find(ClassID) == RegUsage.end()) 5437 RegUsage[ClassID] = 1; 5438 else 5439 RegUsage[ClassID] += 1; 5440 } 5441 } else { 5442 collectUniformsAndScalars(VFs[j]); 5443 for (auto Inst : OpenIntervals) { 5444 // Skip ignored values for VF > 1. 
5445 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5446 continue; 5447 if (isScalarAfterVectorization(Inst, VFs[j])) { 5448 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5449 if (RegUsage.find(ClassID) == RegUsage.end()) 5450 RegUsage[ClassID] = 1; 5451 else 5452 RegUsage[ClassID] += 1; 5453 } else { 5454 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5455 if (RegUsage.find(ClassID) == RegUsage.end()) 5456 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5457 else 5458 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5459 } 5460 } 5461 } 5462 5463 for (auto& pair : RegUsage) { 5464 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5465 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5466 else 5467 MaxUsages[j][pair.first] = pair.second; 5468 } 5469 } 5470 5471 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5472 << OpenIntervals.size() << '\n'); 5473 5474 // Add the current instruction to the list of open intervals. 5475 OpenIntervals.insert(I); 5476 } 5477 5478 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5479 SmallMapVector<unsigned, unsigned, 4> Invariant; 5480 5481 for (auto Inst : LoopInvariants) { 5482 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5483 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5484 if (Invariant.find(ClassID) == Invariant.end()) 5485 Invariant[ClassID] = Usage; 5486 else 5487 Invariant[ClassID] += Usage; 5488 } 5489 5490 LLVM_DEBUG({ 5491 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5492 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5493 << " item\n"; 5494 for (const auto &pair : MaxUsages[i]) { 5495 dbgs() << "LV(REG): RegisterClass: " 5496 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5497 << " registers\n"; 5498 } 5499 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5500 << " item\n"; 5501 for (const auto &pair : Invariant) { 5502 dbgs() << "LV(REG): RegisterClass: " 5503 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5504 << " registers\n"; 5505 } 5506 }); 5507 5508 RU.LoopInvariantRegs = Invariant; 5509 RU.MaxLocalUsers = MaxUsages[i]; 5510 RUs[i] = RU; 5511 } 5512 5513 return RUs; 5514 } 5515 5516 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5517 // TODO: Cost model for emulated masked load/store is completely 5518 // broken. This hack guides the cost model to use an artificially 5519 // high enough value to practically disable vectorization with such 5520 // operations, except where previously deployed legality hack allowed 5521 // using very low cost values. This is to avoid regressions coming simply 5522 // from moving "masked load/store" check from legality to cost model. 5523 // Masked Load/Gather emulation was previously never allowed. 5524 // Limited number of Masked Store/Scatter emulation was allowed. 5525 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction"); 5526 return isa<LoadInst>(I) || 5527 (isa<StoreInst>(I) && 5528 NumPredStores > NumberOfStoresToPredicate); 5529 } 5530 5531 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) { 5532 // If we aren't vectorizing the loop, or if we've already collected the 5533 // instructions to scalarize, there's nothing to do. Collection may already 5534 // have occurred if we have a user-selected VF and are now computing the 5535 // expected cost for interleaving. 
5536 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5537 return;
5538
5539 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5540 // not profitable to scalarize any instructions, the presence of VF in the
5541 // map will indicate that we've analyzed it already.
5542 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5543
5544 // Find all the instructions that are scalar with predication in the loop and
5545 // determine if it would be better to not if-convert the blocks they are in.
5546 // If so, we also record the instructions to scalarize.
5547 for (BasicBlock *BB : TheLoop->blocks()) {
5548 if (!blockNeedsPredication(BB))
5549 continue;
5550 for (Instruction &I : *BB)
5551 if (isScalarWithPredication(&I)) {
5552 ScalarCostsTy ScalarCosts;
5553 // Do not apply discount logic if hacked cost is needed
5554 // for emulated masked memrefs.
5555 if (!useEmulatedMaskMemRefHack(&I) &&
5556 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5557 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5558 // Remember that BB will remain after vectorization.
5559 PredicatedBBsAfterVectorization.insert(BB);
5560 }
5561 }
5562 }
5563
5564 int LoopVectorizationCostModel::computePredInstDiscount(
5565 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5566 unsigned VF) {
5567 assert(!isUniformAfterVectorization(PredInst, VF) &&
5568 "Instruction marked uniform-after-vectorization will be predicated");
5569
5570 // Initialize the discount to zero, meaning that the scalar version and the
5571 // vector version cost the same.
5572 int Discount = 0;
5573
5574 // Holds instructions to analyze. The instructions we visit are mapped in
5575 // ScalarCosts. Those instructions are the ones that would be scalarized if
5576 // we find that the scalar version costs less.
5577 SmallVector<Instruction *, 8> Worklist;
5578
5579 // Returns true if the given instruction can be scalarized.
5580 auto canBeScalarized = [&](Instruction *I) -> bool {
5581 // We only attempt to scalarize instructions forming a single-use chain
5582 // from the original predicated block that would otherwise be vectorized.
5583 // Although not strictly necessary, we give up on instructions we know will
5584 // already be scalar to avoid traversing chains that are unlikely to be
5585 // beneficial.
5586 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5587 isScalarAfterVectorization(I, VF))
5588 return false;
5589
5590 // If the instruction is scalar with predication, it will be analyzed
5591 // separately. We ignore it within the context of PredInst.
5592 if (isScalarWithPredication(I))
5593 return false;
5594
5595 // If any of the instruction's operands are uniform after vectorization,
5596 // the instruction cannot be scalarized. This prevents, for example, a
5597 // masked load from being scalarized.
5598 //
5599 // We assume we will only emit a value for lane zero of an instruction
5600 // marked uniform after vectorization, rather than VF identical values.
5601 // Thus, if we scalarize an instruction that uses a uniform, we would
5602 // create uses of values corresponding to the lanes we aren't emitting code
5603 // for. This behavior can be changed by allowing getScalarValue to clone
5604 // the lane zero values for uniforms rather than asserting.
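// Illustrative example: if operand %a of a candidate instruction %b is
// uniform after vectorization, only %a's lane-0 scalar value will exist in
// the vector loop; scalarizing %b would then reference lane 1..VF-1 values
// of %a that are never emitted, so %b is rejected by the check below.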
5605 for (Use &U : I->operands()) 5606 if (auto *J = dyn_cast<Instruction>(U.get())) 5607 if (isUniformAfterVectorization(J, VF)) 5608 return false; 5609 5610 // Otherwise, we can scalarize the instruction. 5611 return true; 5612 }; 5613 5614 // Compute the expected cost discount from scalarizing the entire expression 5615 // feeding the predicated instruction. We currently only consider expressions 5616 // that are single-use instruction chains. 5617 Worklist.push_back(PredInst); 5618 while (!Worklist.empty()) { 5619 Instruction *I = Worklist.pop_back_val(); 5620 5621 // If we've already analyzed the instruction, there's nothing to do. 5622 if (ScalarCosts.find(I) != ScalarCosts.end()) 5623 continue; 5624 5625 // Compute the cost of the vector instruction. Note that this cost already 5626 // includes the scalarization overhead of the predicated instruction. 5627 unsigned VectorCost = getInstructionCost(I, VF).first; 5628 5629 // Compute the cost of the scalarized instruction. This cost is the cost of 5630 // the instruction as if it wasn't if-converted and instead remained in the 5631 // predicated block. We will scale this cost by block probability after 5632 // computing the scalarization overhead. 5633 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5634 5635 // Compute the scalarization overhead of needed insertelement instructions 5636 // and phi nodes. 5637 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5638 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5639 true, false); 5640 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5641 } 5642 5643 // Compute the scalarization overhead of needed extractelement 5644 // instructions. For each of the instruction's operands, if the operand can 5645 // be scalarized, add it to the worklist; otherwise, account for the 5646 // overhead. 5647 for (Use &U : I->operands()) 5648 if (auto *J = dyn_cast<Instruction>(U.get())) { 5649 assert(VectorType::isValidElementType(J->getType()) && 5650 "Instruction has non-scalar type"); 5651 if (canBeScalarized(J)) 5652 Worklist.push_back(J); 5653 else if (needsExtract(J, VF)) 5654 ScalarCost += TTI.getScalarizationOverhead( 5655 ToVectorTy(J->getType(),VF), false, true); 5656 } 5657 5658 // Scale the total scalar cost by block probability. 5659 ScalarCost /= getReciprocalPredBlockProb(); 5660 5661 // Compute the discount. A non-negative discount means the vector version 5662 // of the instruction costs more, and scalarizing would be beneficial. 5663 Discount += VectorCost - ScalarCost; 5664 ScalarCosts[I] = ScalarCost; 5665 } 5666 5667 return Discount; 5668 } 5669 5670 LoopVectorizationCostModel::VectorizationCostTy 5671 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5672 VectorizationCostTy Cost; 5673 5674 // For each block. 5675 for (BasicBlock *BB : TheLoop->blocks()) { 5676 VectorizationCostTy BlockCost; 5677 5678 // For each instruction in the old loop. 5679 for (Instruction &I : BB->instructionsWithoutDebug()) { 5680 // Skip ignored values. 5681 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5682 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5683 continue; 5684 5685 VectorizationCostTy C = getInstructionCost(&I, VF); 5686 5687 // Check if we should override the cost. 
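// For example, running with -force-target-instruction-cost=1 (the cl::opt
// behind ForceTargetInstructionCost) makes every instruction count as cost 1,
// which helps isolate the effect of target TTI numbers when debugging VF
// selection.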
5688 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5689 C.first = ForceTargetInstructionCost; 5690 5691 BlockCost.first += C.first; 5692 BlockCost.second |= C.second; 5693 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5694 << " for VF " << VF << " For instruction: " << I 5695 << '\n'); 5696 } 5697 5698 // If we are vectorizing a predicated block, it will have been 5699 // if-converted. This means that the block's instructions (aside from 5700 // stores and instructions that may divide by zero) will now be 5701 // unconditionally executed. For the scalar case, we may not always execute 5702 // the predicated block. Thus, scale the block's cost by the probability of 5703 // executing it. 5704 if (VF == 1 && blockNeedsPredication(BB)) 5705 BlockCost.first /= getReciprocalPredBlockProb(); 5706 5707 Cost.first += BlockCost.first; 5708 Cost.second |= BlockCost.second; 5709 } 5710 5711 return Cost; 5712 } 5713 5714 /// Gets Address Access SCEV after verifying that the access pattern 5715 /// is loop invariant except the induction variable dependence. 5716 /// 5717 /// This SCEV can be sent to the Target in order to estimate the address 5718 /// calculation cost. 5719 static const SCEV *getAddressAccessSCEV( 5720 Value *Ptr, 5721 LoopVectorizationLegality *Legal, 5722 PredicatedScalarEvolution &PSE, 5723 const Loop *TheLoop) { 5724 5725 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5726 if (!Gep) 5727 return nullptr; 5728 5729 // We are looking for a gep with all loop invariant indices except for one 5730 // which should be an induction variable. 5731 auto SE = PSE.getSE(); 5732 unsigned NumOperands = Gep->getNumOperands(); 5733 for (unsigned i = 1; i < NumOperands; ++i) { 5734 Value *Opd = Gep->getOperand(i); 5735 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5736 !Legal->isInductionVariable(Opd)) 5737 return nullptr; 5738 } 5739 5740 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5741 return PSE.getSCEV(Ptr); 5742 } 5743 5744 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5745 return Legal->hasStride(I->getOperand(0)) || 5746 Legal->hasStride(I->getOperand(1)); 5747 } 5748 5749 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5750 unsigned VF) { 5751 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5752 Type *ValTy = getMemInstValueType(I); 5753 auto SE = PSE.getSE(); 5754 5755 unsigned AS = getLoadStoreAddressSpace(I); 5756 Value *Ptr = getLoadStorePointerOperand(I); 5757 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5758 5759 // Figure out whether the access is strided and get the stride value 5760 // if it's known in compile time 5761 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5762 5763 // Get the cost of the scalar memory instruction and address computation. 5764 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5765 5766 // Don't pass *I here, since it is scalar but will actually be part of a 5767 // vectorized loop where the user of it is a vectorized instruction. 5768 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5769 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5770 Alignment, AS); 5771 5772 // Get the overhead of the extractelement and insertelement instructions 5773 // we might create due to scalarization. 5774 Cost += getScalarizationOverhead(I, VF); 5775 5776 // If we have a predicated store, it may not be executed for each vector 5777 // lane. 
Scale the cost by the probability of executing the predicated 5778 // block. 5779 if (isPredicatedInst(I)) { 5780 Cost /= getReciprocalPredBlockProb(); 5781 5782 if (useEmulatedMaskMemRefHack(I)) 5783 // Artificially setting to a high enough value to practically disable 5784 // vectorization with such operations. 5785 Cost = 3000000; 5786 } 5787 5788 return Cost; 5789 } 5790 5791 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5792 unsigned VF) { 5793 Type *ValTy = getMemInstValueType(I); 5794 Type *VectorTy = ToVectorTy(ValTy, VF); 5795 Value *Ptr = getLoadStorePointerOperand(I); 5796 unsigned AS = getLoadStoreAddressSpace(I); 5797 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5798 5799 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5800 "Stride should be 1 or -1 for consecutive memory access"); 5801 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5802 unsigned Cost = 0; 5803 if (Legal->isMaskRequired(I)) 5804 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5805 Alignment ? Alignment->value() : 0, AS); 5806 else 5807 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5808 5809 bool Reverse = ConsecutiveStride < 0; 5810 if (Reverse) 5811 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5812 return Cost; 5813 } 5814 5815 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5816 unsigned VF) { 5817 Type *ValTy = getMemInstValueType(I); 5818 Type *VectorTy = ToVectorTy(ValTy, VF); 5819 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5820 unsigned AS = getLoadStoreAddressSpace(I); 5821 if (isa<LoadInst>(I)) { 5822 return TTI.getAddressComputationCost(ValTy) + 5823 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5824 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5825 } 5826 StoreInst *SI = cast<StoreInst>(I); 5827 5828 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5829 return TTI.getAddressComputationCost(ValTy) + 5830 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5831 (isLoopInvariantStoreValue 5832 ? 0 5833 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5834 VF - 1)); 5835 } 5836 5837 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5838 unsigned VF) { 5839 Type *ValTy = getMemInstValueType(I); 5840 Type *VectorTy = ToVectorTy(ValTy, VF); 5841 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5842 Value *Ptr = getLoadStorePointerOperand(I); 5843 5844 return TTI.getAddressComputationCost(VectorTy) + 5845 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5846 Legal->isMaskRequired(I), 5847 Alignment ? Alignment->value() : 0); 5848 } 5849 5850 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5851 unsigned VF) { 5852 Type *ValTy = getMemInstValueType(I); 5853 Type *VectorTy = ToVectorTy(ValTy, VF); 5854 unsigned AS = getLoadStoreAddressSpace(I); 5855 5856 auto Group = getInterleavedAccessGroup(I); 5857 assert(Group && "Fail to get an interleaved access group."); 5858 5859 unsigned InterleaveFactor = Group->getFactor(); 5860 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5861 5862 // Holds the indices of existing members in an interleaved load group. 5863 // An interleaved store group doesn't need this as it doesn't allow gaps. 
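// Illustrative example: for a load group with factor 3 where only members 0
// and 2 exist (say A[3*i] and A[3*i + 2] are read but A[3*i + 1] is not),
// Indices becomes {0, 2}, letting the target price the wide load together
// with the shuffles that extract just those two members.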
5864 SmallVector<unsigned, 4> Indices; 5865 if (isa<LoadInst>(I)) { 5866 for (unsigned i = 0; i < InterleaveFactor; i++) 5867 if (Group->getMember(i)) 5868 Indices.push_back(i); 5869 } 5870 5871 // Calculate the cost of the whole interleaved group. 5872 bool UseMaskForGaps = 5873 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5874 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5875 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5876 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5877 5878 if (Group->isReverse()) { 5879 // TODO: Add support for reversed masked interleaved access. 5880 assert(!Legal->isMaskRequired(I) && 5881 "Reverse masked interleaved access not supported."); 5882 Cost += Group->getNumMembers() * 5883 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5884 } 5885 return Cost; 5886 } 5887 5888 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5889 unsigned VF) { 5890 // Calculate scalar cost only. Vectorization cost should be ready at this 5891 // moment. 5892 if (VF == 1) { 5893 Type *ValTy = getMemInstValueType(I); 5894 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5895 unsigned AS = getLoadStoreAddressSpace(I); 5896 5897 return TTI.getAddressComputationCost(ValTy) + 5898 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5899 } 5900 return getWideningCost(I, VF); 5901 } 5902 5903 LoopVectorizationCostModel::VectorizationCostTy 5904 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5905 // If we know that this instruction will remain uniform, check the cost of 5906 // the scalar version. 5907 if (isUniformAfterVectorization(I, VF)) 5908 VF = 1; 5909 5910 if (VF > 1 && isProfitableToScalarize(I, VF)) 5911 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5912 5913 // Forced scalars do not have any scalarization overhead. 5914 auto ForcedScalar = ForcedScalars.find(VF); 5915 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5916 auto InstSet = ForcedScalar->second; 5917 if (InstSet.find(I) != InstSet.end()) 5918 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5919 } 5920 5921 Type *VectorTy; 5922 unsigned C = getInstructionCost(I, VF, VectorTy); 5923 5924 bool TypeNotScalarized = 5925 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5926 return VectorizationCostTy(C, TypeNotScalarized); 5927 } 5928 5929 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5930 unsigned VF) { 5931 5932 if (VF == 1) 5933 return 0; 5934 5935 unsigned Cost = 0; 5936 Type *RetTy = ToVectorTy(I->getType(), VF); 5937 if (!RetTy->isVoidTy() && 5938 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5939 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5940 5941 // Some targets keep addresses scalar. 5942 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 5943 return Cost; 5944 5945 // Some targets support efficient element stores. 5946 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 5947 return Cost; 5948 5949 // Collect operands to consider. 5950 CallInst *CI = dyn_cast<CallInst>(I); 5951 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 5952 5953 // Skip operands that do not require extraction/scalarization and do not incur 5954 // any overhead. 
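// For example, loop-invariant operands and operands that are themselves
// scalar after vectorization already have scalar values available, so
// filterExtractingOperands() drops them and only operands living in vector
// registers contribute extractelement overhead here.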
5955 return Cost + TTI.getOperandsScalarizationOverhead( 5956 filterExtractingOperands(Ops, VF), VF); 5957 } 5958 5959 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 5960 if (VF == 1) 5961 return; 5962 NumPredStores = 0; 5963 for (BasicBlock *BB : TheLoop->blocks()) { 5964 // For each instruction in the old loop. 5965 for (Instruction &I : *BB) { 5966 Value *Ptr = getLoadStorePointerOperand(&I); 5967 if (!Ptr) 5968 continue; 5969 5970 // TODO: We should generate better code and update the cost model for 5971 // predicated uniform stores. Today they are treated as any other 5972 // predicated store (see added test cases in 5973 // invariant-store-vectorization.ll). 5974 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 5975 NumPredStores++; 5976 5977 if (Legal->isUniform(Ptr) && 5978 // Conditional loads and stores should be scalarized and predicated. 5979 // isScalarWithPredication cannot be used here since masked 5980 // gather/scatters are not considered scalar with predication. 5981 !Legal->blockNeedsPredication(I.getParent())) { 5982 // TODO: Avoid replicating loads and stores instead of 5983 // relying on instcombine to remove them. 5984 // Load: Scalar load + broadcast 5985 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 5986 unsigned Cost = getUniformMemOpCost(&I, VF); 5987 setWideningDecision(&I, VF, CM_Scalarize, Cost); 5988 continue; 5989 } 5990 5991 // We assume that widening is the best solution when possible. 5992 if (memoryInstructionCanBeWidened(&I, VF)) { 5993 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 5994 int ConsecutiveStride = 5995 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 5996 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5997 "Expected consecutive stride."); 5998 InstWidening Decision = 5999 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6000 setWideningDecision(&I, VF, Decision, Cost); 6001 continue; 6002 } 6003 6004 // Choose between Interleaving, Gather/Scatter or Scalarization. 6005 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6006 unsigned NumAccesses = 1; 6007 if (isAccessInterleaved(&I)) { 6008 auto Group = getInterleavedAccessGroup(&I); 6009 assert(Group && "Fail to get an interleaved access group."); 6010 6011 // Make one decision for the whole group. 6012 if (getWideningDecision(&I, VF) != CM_Unknown) 6013 continue; 6014 6015 NumAccesses = Group->getNumMembers(); 6016 if (interleavedAccessCanBeWidened(&I, VF)) 6017 InterleaveCost = getInterleaveGroupCost(&I, VF); 6018 } 6019 6020 unsigned GatherScatterCost = 6021 isLegalGatherOrScatter(&I) 6022 ? getGatherScatterCost(&I, VF) * NumAccesses 6023 : std::numeric_limits<unsigned>::max(); 6024 6025 unsigned ScalarizationCost = 6026 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6027 6028 // Choose better solution for the current VF, 6029 // write down this decision and use it during vectorization. 6030 unsigned Cost; 6031 InstWidening Decision; 6032 if (InterleaveCost <= GatherScatterCost && 6033 InterleaveCost < ScalarizationCost) { 6034 Decision = CM_Interleave; 6035 Cost = InterleaveCost; 6036 } else if (GatherScatterCost < ScalarizationCost) { 6037 Decision = CM_GatherScatter; 6038 Cost = GatherScatterCost; 6039 } else { 6040 Decision = CM_Scalarize; 6041 Cost = ScalarizationCost; 6042 } 6043 // If the instructions belongs to an interleave group, the whole group 6044 // receives the same decision. 
The whole group receives the cost, but 6045 // the cost will actually be assigned to one instruction. 6046 if (auto Group = getInterleavedAccessGroup(&I)) 6047 setWideningDecision(Group, VF, Decision, Cost); 6048 else 6049 setWideningDecision(&I, VF, Decision, Cost); 6050 } 6051 } 6052 6053 // Make sure that any load of address and any other address computation 6054 // remains scalar unless there is gather/scatter support. This avoids 6055 // inevitable extracts into address registers, and also has the benefit of 6056 // activating LSR more, since that pass can't optimize vectorized 6057 // addresses. 6058 if (TTI.prefersVectorizedAddressing()) 6059 return; 6060 6061 // Start with all scalar pointer uses. 6062 SmallPtrSet<Instruction *, 8> AddrDefs; 6063 for (BasicBlock *BB : TheLoop->blocks()) 6064 for (Instruction &I : *BB) { 6065 Instruction *PtrDef = 6066 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6067 if (PtrDef && TheLoop->contains(PtrDef) && 6068 getWideningDecision(&I, VF) != CM_GatherScatter) 6069 AddrDefs.insert(PtrDef); 6070 } 6071 6072 // Add all instructions used to generate the addresses. 6073 SmallVector<Instruction *, 4> Worklist; 6074 for (auto *I : AddrDefs) 6075 Worklist.push_back(I); 6076 while (!Worklist.empty()) { 6077 Instruction *I = Worklist.pop_back_val(); 6078 for (auto &Op : I->operands()) 6079 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6080 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6081 AddrDefs.insert(InstOp).second) 6082 Worklist.push_back(InstOp); 6083 } 6084 6085 for (auto *I : AddrDefs) { 6086 if (isa<LoadInst>(I)) { 6087 // Setting the desired widening decision should ideally be handled in 6088 // by cost functions, but since this involves the task of finding out 6089 // if the loaded register is involved in an address computation, it is 6090 // instead changed here when we know this is the case. 6091 InstWidening Decision = getWideningDecision(I, VF); 6092 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6093 // Scalarize a widened load of address. 6094 setWideningDecision(I, VF, CM_Scalarize, 6095 (VF * getMemoryInstructionCost(I, 1))); 6096 else if (auto Group = getInterleavedAccessGroup(I)) { 6097 // Scalarize an interleave group of address loads. 6098 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6099 if (Instruction *Member = Group->getMember(I)) 6100 setWideningDecision(Member, VF, CM_Scalarize, 6101 (VF * getMemoryInstructionCost(Member, 1))); 6102 } 6103 } 6104 } else 6105 // Make sure I gets scalarized and a cost estimate without 6106 // scalarization overhead. 6107 ForcedScalars[VF].insert(I); 6108 } 6109 } 6110 6111 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6112 unsigned VF, 6113 Type *&VectorTy) { 6114 Type *RetTy = I->getType(); 6115 if (canTruncateToMinimalBitwidth(I, VF)) 6116 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6117 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6118 auto SE = PSE.getSE(); 6119 6120 // TODO: We need to estimate the cost of intrinsic calls. 6121 switch (I->getOpcode()) { 6122 case Instruction::GetElementPtr: 6123 // We mark this instruction as zero-cost because the cost of GEPs in 6124 // vectorized code depends on whether the corresponding memory instruction 6125 // is scalarized or not. Therefore, we handle GEPs with the memory 6126 // instruction cost. 
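// E.g. for a scalarized access the address cost is folded into
// getMemInstScalarizationCost(), and for a gather/scatter it is part of the
// vector address computation priced in getGatherScatterCost(); charging the
// GEP again here would double-count it.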
6127 return 0; 6128 case Instruction::Br: { 6129 // In cases of scalarized and predicated instructions, there will be VF 6130 // predicated blocks in the vectorized loop. Each branch around these 6131 // blocks requires also an extract of its vector compare i1 element. 6132 bool ScalarPredicatedBB = false; 6133 BranchInst *BI = cast<BranchInst>(I); 6134 if (VF > 1 && BI->isConditional() && 6135 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6136 PredicatedBBsAfterVectorization.end() || 6137 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6138 PredicatedBBsAfterVectorization.end())) 6139 ScalarPredicatedBB = true; 6140 6141 if (ScalarPredicatedBB) { 6142 // Return cost for branches around scalarized and predicated blocks. 6143 Type *Vec_i1Ty = 6144 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6145 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6146 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6147 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6148 // The back-edge branch will remain, as will all scalar branches. 6149 return TTI.getCFInstrCost(Instruction::Br); 6150 else 6151 // This branch will be eliminated by if-conversion. 6152 return 0; 6153 // Note: We currently assume zero cost for an unconditional branch inside 6154 // a predicated block since it will become a fall-through, although we 6155 // may decide in the future to call TTI for all branches. 6156 } 6157 case Instruction::PHI: { 6158 auto *Phi = cast<PHINode>(I); 6159 6160 // First-order recurrences are replaced by vector shuffles inside the loop. 6161 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6162 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6163 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6164 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6165 6166 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6167 // converted into select instructions. We require N - 1 selects per phi 6168 // node, where N is the number of incoming values. 6169 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6170 return (Phi->getNumIncomingValues() - 1) * 6171 TTI.getCmpSelInstrCost( 6172 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6173 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6174 6175 return TTI.getCFInstrCost(Instruction::PHI); 6176 } 6177 case Instruction::UDiv: 6178 case Instruction::SDiv: 6179 case Instruction::URem: 6180 case Instruction::SRem: 6181 // If we have a predicated instruction, it may not be executed for each 6182 // vector lane. Get the scalarization cost and scale this amount by the 6183 // probability of executing the predicated block. If the instruction is not 6184 // predicated, we fall through to the next case. 6185 if (VF > 1 && isScalarWithPredication(I)) { 6186 unsigned Cost = 0; 6187 6188 // These instructions have a non-void type, so account for the phi nodes 6189 // that we will create. This cost is likely to be zero. The phi node 6190 // cost, if any, should be scaled by the block probability because it 6191 // models a copy at the end of each predicated block. 6192 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6193 6194 // The cost of the non-predicated instruction. 6195 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6196 6197 // The cost of insertelement and extractelement instructions needed for 6198 // scalarization. 
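// Worked example with assumed TTI numbers: at VF = 4, with a PHI cost of 0,
// a scalar udiv cost of 4 and a scalarization overhead of 6, the running
// total is 4*0 + 4*4 + 6 = 22; the division by getReciprocalPredBlockProb()
// (currently 2) below then yields 11.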
6199 Cost += getScalarizationOverhead(I, VF); 6200 6201 // Scale the cost by the probability of executing the predicated blocks. 6202 // This assumes the predicated block for each vector lane is equally 6203 // likely. 6204 return Cost / getReciprocalPredBlockProb(); 6205 } 6206 LLVM_FALLTHROUGH; 6207 case Instruction::Add: 6208 case Instruction::FAdd: 6209 case Instruction::Sub: 6210 case Instruction::FSub: 6211 case Instruction::Mul: 6212 case Instruction::FMul: 6213 case Instruction::FDiv: 6214 case Instruction::FRem: 6215 case Instruction::Shl: 6216 case Instruction::LShr: 6217 case Instruction::AShr: 6218 case Instruction::And: 6219 case Instruction::Or: 6220 case Instruction::Xor: { 6221 // Since we will replace the stride by 1 the multiplication should go away. 6222 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6223 return 0; 6224 // Certain instructions can be cheaper to vectorize if they have a constant 6225 // second vector operand. One example of this are shifts on x86. 6226 Value *Op2 = I->getOperand(1); 6227 TargetTransformInfo::OperandValueProperties Op2VP; 6228 TargetTransformInfo::OperandValueKind Op2VK = 6229 TTI.getOperandInfo(Op2, Op2VP); 6230 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6231 Op2VK = TargetTransformInfo::OK_UniformValue; 6232 6233 SmallVector<const Value *, 4> Operands(I->operand_values()); 6234 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6235 return N * TTI.getArithmeticInstrCost( 6236 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6237 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6238 } 6239 case Instruction::FNeg: { 6240 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6241 return N * TTI.getArithmeticInstrCost( 6242 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6243 TargetTransformInfo::OK_AnyValue, 6244 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6245 I->getOperand(0), I); 6246 } 6247 case Instruction::Select: { 6248 SelectInst *SI = cast<SelectInst>(I); 6249 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6250 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6251 Type *CondTy = SI->getCondition()->getType(); 6252 if (!ScalarCond) 6253 CondTy = VectorType::get(CondTy, VF); 6254 6255 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6256 } 6257 case Instruction::ICmp: 6258 case Instruction::FCmp: { 6259 Type *ValTy = I->getOperand(0)->getType(); 6260 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6261 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6262 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6263 VectorTy = ToVectorTy(ValTy, VF); 6264 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6265 } 6266 case Instruction::Store: 6267 case Instruction::Load: { 6268 unsigned Width = VF; 6269 if (Width > 1) { 6270 InstWidening Decision = getWideningDecision(I, Width); 6271 assert(Decision != CM_Unknown && 6272 "CM decision should be taken at this point"); 6273 if (Decision == CM_Scalarize) 6274 Width = 1; 6275 } 6276 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6277 return getMemoryInstructionCost(I, VF); 6278 } 6279 case Instruction::ZExt: 6280 case Instruction::SExt: 6281 case Instruction::FPToUI: 6282 case Instruction::FPToSI: 6283 case Instruction::FPExt: 6284 case Instruction::PtrToInt: 6285 case Instruction::IntToPtr: 6286 case Instruction::SIToFP: 6287 case Instruction::UIToFP: 6288 case 
Instruction::Trunc: 6289 case Instruction::FPTrunc: 6290 case Instruction::BitCast: { 6291 // We optimize the truncation of induction variables having constant 6292 // integer steps. The cost of these truncations is the same as the scalar 6293 // operation. 6294 if (isOptimizableIVTruncate(I, VF)) { 6295 auto *Trunc = cast<TruncInst>(I); 6296 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6297 Trunc->getSrcTy(), Trunc); 6298 } 6299 6300 Type *SrcScalarTy = I->getOperand(0)->getType(); 6301 Type *SrcVecTy = 6302 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6303 if (canTruncateToMinimalBitwidth(I, VF)) { 6304 // This cast is going to be shrunk. This may remove the cast or it might 6305 // turn it into slightly different cast. For example, if MinBW == 16, 6306 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6307 // 6308 // Calculate the modified src and dest types. 6309 Type *MinVecTy = VectorTy; 6310 if (I->getOpcode() == Instruction::Trunc) { 6311 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6312 VectorTy = 6313 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6314 } else if (I->getOpcode() == Instruction::ZExt || 6315 I->getOpcode() == Instruction::SExt) { 6316 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6317 VectorTy = 6318 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6319 } 6320 } 6321 6322 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6323 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6324 } 6325 case Instruction::Call: { 6326 bool NeedToScalarize; 6327 CallInst *CI = cast<CallInst>(I); 6328 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6329 if (getVectorIntrinsicIDForCall(CI, TLI)) 6330 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6331 return CallCost; 6332 } 6333 default: 6334 // The cost of executing VF copies of the scalar instruction. This opcode 6335 // is unknown. Assume that it is the same as 'mul'. 6336 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6337 getScalarizationOverhead(I, VF); 6338 } // end of switch. 
6339 } 6340 6341 char LoopVectorize::ID = 0; 6342 6343 static const char lv_name[] = "Loop Vectorization"; 6344 6345 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6346 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6347 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6348 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6349 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6350 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6351 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6352 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6353 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6354 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6355 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6356 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6357 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6358 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6359 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6360 6361 namespace llvm { 6362 6363 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6364 6365 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6366 bool VectorizeOnlyWhenForced) { 6367 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6368 } 6369 6370 } // end namespace llvm 6371 6372 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6373 // Check if the pointer operand of a load or store instruction is 6374 // consecutive. 6375 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6376 return Legal->isConsecutivePtr(Ptr); 6377 return false; 6378 } 6379 6380 void LoopVectorizationCostModel::collectValuesToIgnore() { 6381 // Ignore ephemeral values. 6382 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6383 6384 // Ignore type-promoting instructions we identified during reduction 6385 // detection. 6386 for (auto &Reduction : *Legal->getReductionVars()) { 6387 RecurrenceDescriptor &RedDes = Reduction.second; 6388 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6389 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6390 } 6391 // Ignore type-casting instructions we identified during induction 6392 // detection. 6393 for (auto &Induction : *Legal->getInductionVars()) { 6394 InductionDescriptor &IndDes = Induction.second; 6395 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6396 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6397 } 6398 } 6399 6400 // TODO: we could return a pair of values that specify the max VF and 6401 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6402 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 6403 // doesn't have a cost model that can choose which plan to execute if 6404 // more than one is generated. 6405 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6406 LoopVectorizationCostModel &CM) { 6407 unsigned WidestType; 6408 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6409 return WidestVectorRegBits / WidestType; 6410 } 6411 6412 VectorizationFactor 6413 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6414 unsigned VF = UserVF; 6415 // Outer loop handling: They may require CFG and instruction level 6416 // transformations before even evaluating whether vectorization is profitable. 6417 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6418 // the vectorization pipeline. 
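// Note: Loop::empty() means "has no sub-loops", so !OrigLoop->empty() below
// selects outer loops such as the 'i' loop in
//   for (i) { for (j) A[i][j] = ...; }
// while innermost loops fall through to the bail-out at the end.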
6419 if (!OrigLoop->empty()) { 6420 // If the user doesn't provide a vectorization factor, determine a 6421 // reasonable one. 6422 if (!UserVF) { 6423 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6424 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6425 6426 // Make sure we have a VF > 1 for stress testing. 6427 if (VPlanBuildStressTest && VF < 2) { 6428 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6429 << "overriding computed VF.\n"); 6430 VF = 4; 6431 } 6432 } 6433 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6434 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6435 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6436 << " to build VPlans.\n"); 6437 buildVPlans(VF, VF); 6438 6439 // For VPlan build stress testing, we bail out after VPlan construction. 6440 if (VPlanBuildStressTest) 6441 return VectorizationFactor::Disabled(); 6442 6443 return {VF, 0}; 6444 } 6445 6446 LLVM_DEBUG( 6447 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6448 "VPlan-native path.\n"); 6449 return VectorizationFactor::Disabled(); 6450 } 6451 6452 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { 6453 assert(OrigLoop->empty() && "Inner loop expected."); 6454 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); 6455 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6456 return None; 6457 6458 // Invalidate interleave groups if all blocks of loop will be predicated. 6459 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6460 !useMaskedInterleavedAccesses(*TTI)) { 6461 LLVM_DEBUG( 6462 dbgs() 6463 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6464 "which requires masked-interleaved support.\n"); 6465 CM.InterleaveInfo.reset(); 6466 } 6467 6468 if (UserVF) { 6469 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6470 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6471 // Collect the instructions (and their associated costs) that will be more 6472 // profitable to scalarize. 6473 CM.selectUserVectorizationFactor(UserVF); 6474 buildVPlansWithVPRecipes(UserVF, UserVF); 6475 LLVM_DEBUG(printPlans(dbgs())); 6476 return {{UserVF, 0}}; 6477 } 6478 6479 unsigned MaxVF = MaybeMaxVF.getValue(); 6480 assert(MaxVF != 0 && "MaxVF is zero."); 6481 6482 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6483 // Collect Uniform and Scalar instructions after vectorization with VF. 6484 CM.collectUniformsAndScalars(VF); 6485 6486 // Collect the instructions (and their associated costs) that will be more 6487 // profitable to scalarize. 6488 if (VF > 1) 6489 CM.collectInstsToScalarize(VF); 6490 } 6491 6492 buildVPlansWithVPRecipes(1, MaxVF); 6493 LLVM_DEBUG(printPlans(dbgs())); 6494 if (MaxVF == 1) 6495 return VectorizationFactor::Disabled(); 6496 6497 // Select the optimal vectorization factor. 6498 return CM.selectVectorizationFactor(MaxVF); 6499 } 6500 6501 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6502 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6503 << '\n'); 6504 BestVF = VF; 6505 BestUF = UF; 6506 6507 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6508 return !Plan->hasVF(VF); 6509 }); 6510 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6511 } 6512 6513 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6514 DominatorTree *DT) { 6515 // Perform the actual loop transformation. 6516 6517 // 1. 
Create a new empty loop. Unlink the old loop and connect the new one. 6518 VPCallbackILV CallbackILV(ILV); 6519 6520 VPTransformState State{BestVF, BestUF, LI, 6521 DT, ILV.Builder, ILV.VectorLoopValueMap, 6522 &ILV, CallbackILV}; 6523 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6524 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6525 6526 //===------------------------------------------------===// 6527 // 6528 // Notice: any optimization or new instruction that go 6529 // into the code below should also be implemented in 6530 // the cost-model. 6531 // 6532 //===------------------------------------------------===// 6533 6534 // 2. Copy and widen instructions from the old loop into the new loop. 6535 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6536 VPlans.front()->execute(&State); 6537 6538 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6539 // predication, updating analyses. 6540 ILV.fixVectorizedLoop(); 6541 } 6542 6543 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6544 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6545 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6546 6547 // We create new control-flow for the vectorized loop, so the original 6548 // condition will be dead after vectorization if it's only used by the 6549 // branch. 6550 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6551 if (Cmp && Cmp->hasOneUse()) 6552 DeadInstructions.insert(Cmp); 6553 6554 // We create new "steps" for induction variable updates to which the original 6555 // induction variables map. An original update instruction will be dead if 6556 // all its users except the induction variable are dead. 6557 for (auto &Induction : *Legal->getInductionVars()) { 6558 PHINode *Ind = Induction.first; 6559 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6560 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6561 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6562 DeadInstructions.end(); 6563 })) 6564 DeadInstructions.insert(IndUpdate); 6565 6566 // We record as "Dead" also the type-casting instructions we had identified 6567 // during induction analysis. We don't need any handling for them in the 6568 // vectorized loop because we have proven that, under a proper runtime 6569 // test guarding the vectorized loop, the value of the phi, and the casted 6570 // value of the phi, are the same. The last instruction in this casting chain 6571 // will get its scalar/vector/widened def from the scalar/vector/widened def 6572 // of the respective phi node. Any other casts in the induction def-use chain 6573 // have no other uses outside the phi update chain, and will be ignored. 6574 InductionDescriptor &IndDes = Induction.second; 6575 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6576 DeadInstructions.insert(Casts.begin(), Casts.end()); 6577 } 6578 } 6579 6580 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6581 6582 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6583 6584 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6585 Instruction::BinaryOps BinOp) { 6586 // When unrolling and the VF is 1, we only need to add a simple scalar. 
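// For example, when interleaving by UF = 2 at VF = 1, the copy of an
// induction with step %step for unroll part 1 is simply %iv + 1 * %step
// (this is called with StartIdx = 1), rather than a step vector as in the
// vectorizer's version of getStepVector().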
6587 Type *Ty = Val->getType(); 6588 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6589 6590 if (Ty->isFloatingPointTy()) { 6591 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6592 6593 // Floating point operations had to be 'fast' to enable the unrolling. 6594 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6595 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6596 } 6597 Constant *C = ConstantInt::get(Ty, StartIdx); 6598 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6599 } 6600 6601 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6602 SmallVector<Metadata *, 4> MDs; 6603 // Reserve first location for self reference to the LoopID metadata node. 6604 MDs.push_back(nullptr); 6605 bool IsUnrollMetadata = false; 6606 MDNode *LoopID = L->getLoopID(); 6607 if (LoopID) { 6608 // First find existing loop unrolling disable metadata. 6609 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6610 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6611 if (MD) { 6612 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6613 IsUnrollMetadata = 6614 S && S->getString().startswith("llvm.loop.unroll.disable"); 6615 } 6616 MDs.push_back(LoopID->getOperand(i)); 6617 } 6618 } 6619 6620 if (!IsUnrollMetadata) { 6621 // Add runtime unroll disable metadata. 6622 LLVMContext &Context = L->getHeader()->getContext(); 6623 SmallVector<Metadata *, 1> DisableOperands; 6624 DisableOperands.push_back( 6625 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6626 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6627 MDs.push_back(DisableNode); 6628 MDNode *NewLoopID = MDNode::get(Context, MDs); 6629 // Set operand 0 to refer to the loop id itself. 6630 NewLoopID->replaceOperandWith(0, NewLoopID); 6631 L->setLoopID(NewLoopID); 6632 } 6633 } 6634 6635 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6636 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6637 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6638 bool PredicateAtRangeStart = Predicate(Range.Start); 6639 6640 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6641 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6642 Range.End = TmpVF; 6643 break; 6644 } 6645 6646 return PredicateAtRangeStart; 6647 } 6648 6649 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6650 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6651 /// of VF's starting at a given VF and extending it as much as possible. Each 6652 /// vectorization decision can potentially shorten this sub-range during 6653 /// buildVPlan(). 6654 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6655 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6656 VFRange SubRange = {VF, MaxVF + 1}; 6657 VPlans.push_back(buildVPlan(SubRange)); 6658 VF = SubRange.End; 6659 } 6660 } 6661 6662 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6663 VPlanPtr &Plan) { 6664 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6665 6666 // Look for cached value. 6667 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6668 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6669 if (ECEntryIt != EdgeMaskCache.end()) 6670 return ECEntryIt->second; 6671 6672 VPValue *SrcMask = createBlockInMask(Src, Plan); 6673 6674 // The terminator has to be a branch inst! 
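// Illustrative example: if Src ends in 'br i1 %c, label %then, label %end',
// the mask of edge Src->then is (SrcMask & %c) and the mask of Src->end is
// (SrcMask & !%c); when SrcMask is the all-one (null) mask the AND below is
// skipped.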
6675 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6676 assert(BI && "Unexpected terminator found"); 6677 6678 if (!BI->isConditional()) 6679 return EdgeMaskCache[Edge] = SrcMask; 6680 6681 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6682 assert(EdgeMask && "No Edge Mask found for condition"); 6683 6684 if (BI->getSuccessor(0) != Dst) 6685 EdgeMask = Builder.createNot(EdgeMask); 6686 6687 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6688 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6689 6690 return EdgeMaskCache[Edge] = EdgeMask; 6691 } 6692 6693 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6694 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6695 6696 // Look for cached value. 6697 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6698 if (BCEntryIt != BlockMaskCache.end()) 6699 return BCEntryIt->second; 6700 6701 // All-one mask is modelled as no-mask following the convention for masked 6702 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6703 VPValue *BlockMask = nullptr; 6704 6705 if (OrigLoop->getHeader() == BB) { 6706 if (!CM.blockNeedsPredication(BB)) 6707 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6708 6709 // Introduce the early-exit compare IV <= BTC to form header block mask. 6710 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6711 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6712 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6713 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6714 return BlockMaskCache[BB] = BlockMask; 6715 } 6716 6717 // This is the block mask. We OR all incoming edges. 6718 for (auto *Predecessor : predecessors(BB)) { 6719 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6720 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6721 return BlockMaskCache[BB] = EdgeMask; 6722 6723 if (!BlockMask) { // BlockMask has its initialized nullptr value. 6724 BlockMask = EdgeMask; 6725 continue; 6726 } 6727 6728 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6729 } 6730 6731 return BlockMaskCache[BB] = BlockMask; 6732 } 6733 6734 VPWidenMemoryInstructionRecipe * 6735 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6736 VPlanPtr &Plan) { 6737 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6738 return nullptr; 6739 6740 auto willWiden = [&](unsigned VF) -> bool { 6741 if (VF == 1) 6742 return false; 6743 LoopVectorizationCostModel::InstWidening Decision = 6744 CM.getWideningDecision(I, VF); 6745 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6746 "CM decision should be taken at this point."); 6747 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6748 return true; 6749 if (CM.isScalarAfterVectorization(I, VF) || 6750 CM.isProfitableToScalarize(I, VF)) 6751 return false; 6752 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6753 }; 6754 6755 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6756 return nullptr; 6757 6758 VPValue *Mask = nullptr; 6759 if (Legal->isMaskRequired(I)) 6760 Mask = createBlockInMask(I->getParent(), Plan); 6761 6762 return new VPWidenMemoryInstructionRecipe(*I, Mask); 6763 } 6764 6765 VPWidenIntOrFpInductionRecipe * 6766 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6767 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6768 // Check if this is an integer or fp induction. 
If so, build the recipe that 6769 // produces its scalar and vector values. 6770 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); 6771 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6772 II.getKind() == InductionDescriptor::IK_FpInduction) 6773 return new VPWidenIntOrFpInductionRecipe(Phi); 6774 6775 return nullptr; 6776 } 6777 6778 // Optimize the special case where the source is a constant integer 6779 // induction variable. Notice that we can only optimize the 'trunc' case 6780 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6781 // (c) other casts depend on pointer size. 6782 6783 // Determine whether \p K is a truncation based on an induction variable that 6784 // can be optimized. 6785 auto isOptimizableIVTruncate = 6786 [&](Instruction *K) -> std::function<bool(unsigned)> { 6787 return 6788 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6789 }; 6790 6791 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6792 isOptimizableIVTruncate(I), Range)) 6793 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6794 cast<TruncInst>(I)); 6795 return nullptr; 6796 } 6797 6798 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6799 PHINode *Phi = dyn_cast<PHINode>(I); 6800 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6801 return nullptr; 6802 6803 // We know that all PHIs in non-header blocks are converted into selects, so 6804 // we don't have to worry about the insertion order and we can just use the 6805 // builder. At this point we generate the predication tree. There may be 6806 // duplications since this is a simple recursive scan, but future 6807 // optimizations will clean it up. 6808 6809 SmallVector<VPValue *, 2> Masks; 6810 unsigned NumIncoming = Phi->getNumIncomingValues(); 6811 for (unsigned In = 0; In < NumIncoming; In++) { 6812 VPValue *EdgeMask = 6813 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6814 assert((EdgeMask || NumIncoming == 1) && 6815 "Multiple predecessors with one having a full mask"); 6816 if (EdgeMask) 6817 Masks.push_back(EdgeMask); 6818 } 6819 return new VPBlendRecipe(Phi, Masks); 6820 } 6821 6822 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6823 VFRange &Range) { 6824 6825 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6826 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6827 6828 if (IsPredicated) 6829 return false; 6830 6831 auto IsVectorizableOpcode = [](unsigned Opcode) { 6832 switch (Opcode) { 6833 case Instruction::Add: 6834 case Instruction::And: 6835 case Instruction::AShr: 6836 case Instruction::BitCast: 6837 case Instruction::Br: 6838 case Instruction::Call: 6839 case Instruction::FAdd: 6840 case Instruction::FCmp: 6841 case Instruction::FDiv: 6842 case Instruction::FMul: 6843 case Instruction::FNeg: 6844 case Instruction::FPExt: 6845 case Instruction::FPToSI: 6846 case Instruction::FPToUI: 6847 case Instruction::FPTrunc: 6848 case Instruction::FRem: 6849 case Instruction::FSub: 6850 case Instruction::ICmp: 6851 case Instruction::IntToPtr: 6852 case Instruction::Load: 6853 case Instruction::LShr: 6854 case Instruction::Mul: 6855 case Instruction::Or: 6856 case Instruction::PHI: 6857 case Instruction::PtrToInt: 6858 case Instruction::SDiv: 6859 case Instruction::Select: 6860 case Instruction::SExt: 6861 case Instruction::Shl: 6862 case Instruction::SIToFP: 6863 case Instruction::SRem: 6864 case 
Instruction::Store: 6865 case Instruction::Sub: 6866 case Instruction::Trunc: 6867 case Instruction::UDiv: 6868 case Instruction::UIToFP: 6869 case Instruction::URem: 6870 case Instruction::Xor: 6871 case Instruction::ZExt: 6872 return true; 6873 } 6874 return false; 6875 }; 6876 6877 if (!IsVectorizableOpcode(I->getOpcode())) 6878 return false; 6879 6880 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6881 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6882 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6883 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6884 return false; 6885 } 6886 6887 auto willWiden = [&](unsigned VF) -> bool { 6888 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6889 CM.isProfitableToScalarize(I, VF))) 6890 return false; 6891 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6892 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6893 // The following case may be scalarized depending on the VF. 6894 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6895 // version of the instruction. 6896 // Is it beneficial to perform intrinsic call compared to lib call? 6897 bool NeedToScalarize; 6898 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6899 bool UseVectorIntrinsic = 6900 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6901 return UseVectorIntrinsic || !NeedToScalarize; 6902 } 6903 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6904 assert(CM.getWideningDecision(I, VF) == 6905 LoopVectorizationCostModel::CM_Scalarize && 6906 "Memory widening decisions should have been taken care by now"); 6907 return false; 6908 } 6909 return true; 6910 }; 6911 6912 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6913 return false; 6914 // If this ingredient's recipe is to be recorded, keep its recipe a singleton 6915 // to avoid having to split recipes later. 6916 bool IsSingleton = Ingredient2Recipe.count(I); 6917 6918 // Success: widen this instruction. 6919 6920 // Use the default widening recipe. We optimize the common case where 6921 // consecutive instructions can be represented by a single recipe. 6922 if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && 6923 LastExtensibleRecipe->appendInstruction(I)) 6924 return true; 6925 6926 VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); 6927 if (!IsSingleton) 6928 LastExtensibleRecipe = WidenRecipe; 6929 setRecipe(I, WidenRecipe); 6930 VPBB->appendRecipe(WidenRecipe); 6931 return true; 6932 } 6933 6934 VPBasicBlock *VPRecipeBuilder::handleReplication( 6935 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 6936 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 6937 VPlanPtr &Plan) { 6938 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 6939 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 6940 Range); 6941 6942 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6943 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6944 6945 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 6946 setRecipe(I, Recipe); 6947 6948 // Find if I uses a predicated instruction. If so, it will use its scalar 6949 // value. Avoid hoisting the insert-element which packs the scalar value into 6950 // a vector value, as that happens iff all users use the vector value. 
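// In other words, as soon as one user of a predicated instruction is known
// to consume its per-lane scalar values, the packing insert-elements must not
// be hoisted as if every user wanted the vector form; the loop below records
// that by clearing AlsoPack on the operand's recipe.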
6951 for (auto &Op : I->operands()) 6952 if (auto *PredInst = dyn_cast<Instruction>(Op)) 6953 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 6954 PredInst2Recipe[PredInst]->setAlsoPack(false); 6955 6956 // Finalize the recipe for Instr, first if it is not predicated. 6957 if (!IsPredicated) { 6958 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 6959 VPBB->appendRecipe(Recipe); 6960 return VPBB; 6961 } 6962 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 6963 assert(VPBB->getSuccessors().empty() && 6964 "VPBB has successors when handling predicated replication."); 6965 // Record predicated instructions for above packing optimizations. 6966 PredInst2Recipe[I] = Recipe; 6967 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 6968 VPBlockUtils::insertBlockAfter(Region, VPBB); 6969 auto *RegSucc = new VPBasicBlock(); 6970 VPBlockUtils::insertBlockAfter(RegSucc, Region); 6971 return RegSucc; 6972 } 6973 6974 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 6975 VPRecipeBase *PredRecipe, 6976 VPlanPtr &Plan) { 6977 // Instructions marked for predication are replicated and placed under an 6978 // if-then construct to prevent side-effects. 6979 6980 // Generate recipes to compute the block mask for this region. 6981 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 6982 6983 // Build the triangular if-then region. 6984 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 6985 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 6986 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 6987 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 6988 auto *PHIRecipe = 6989 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 6990 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 6991 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 6992 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 6993 6994 // Note: first set Entry as region entry and then connect successors starting 6995 // from it in order, to propagate the "parent" of each VPBasicBlock. 6996 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 6997 VPBlockUtils::connectBlocks(Pred, Exit); 6998 6999 return Region; 7000 } 7001 7002 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 7003 VPlanPtr &Plan, VPBasicBlock *VPBB) { 7004 VPRecipeBase *Recipe = nullptr; 7005 7006 // First, check for specific widening recipes that deal with memory 7007 // operations, inductions and Phi nodes. 7008 if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || 7009 (Recipe = tryToOptimizeInduction(Instr, Range)) || 7010 (Recipe = tryToBlend(Instr, Plan)) || 7011 (isa<PHINode>(Instr) && 7012 (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { 7013 setRecipe(Instr, Recipe); 7014 VPBB->appendRecipe(Recipe); 7015 return true; 7016 } 7017 7018 // Handle GEP widening. 
7019 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { 7020 auto Scalarize = [&](unsigned VF) { 7021 return CM.isScalarWithPredication(Instr, VF) || 7022 CM.isScalarAfterVectorization(Instr, VF) || 7023 CM.isProfitableToScalarize(Instr, VF); 7024 }; 7025 if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) 7026 return false; 7027 VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); 7028 setRecipe(Instr, Recipe); 7029 VPBB->appendRecipe(Recipe); 7030 return true; 7031 } 7032 7033 // Check if Instr is to be widened by a general VPWidenRecipe, after 7034 // having first checked for specific widening recipes. 7035 if (tryToWiden(Instr, VPBB, Range)) 7036 return true; 7037 7038 return false; 7039 } 7040 7041 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7042 unsigned MaxVF) { 7043 assert(OrigLoop->empty() && "Inner loop expected."); 7044 7045 // Collect conditions feeding internal conditional branches; they need to be 7046 // represented in VPlan for it to model masking. 7047 SmallPtrSet<Value *, 1> NeedDef; 7048 7049 auto *Latch = OrigLoop->getLoopLatch(); 7050 for (BasicBlock *BB : OrigLoop->blocks()) { 7051 if (BB == Latch) 7052 continue; 7053 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7054 if (Branch && Branch->isConditional()) 7055 NeedDef.insert(Branch->getCondition()); 7056 } 7057 7058 // If the tail is to be folded by masking, the primary induction variable 7059 // needs to be represented in VPlan for it to model early-exit masking. 7060 // Also, both the Phi and the live-out instruction of each reduction are 7061 // required in order to introduce a select between them in VPlan. 7062 if (CM.foldTailByMasking()) { 7063 NeedDef.insert(Legal->getPrimaryInduction()); 7064 for (auto &Reduction : *Legal->getReductionVars()) { 7065 NeedDef.insert(Reduction.first); 7066 NeedDef.insert(Reduction.second.getLoopExitInstr()); 7067 } 7068 } 7069 7070 // Collect instructions from the original loop that will become trivially dead 7071 // in the vectorized loop. We don't need to vectorize these instructions. For 7072 // example, original induction update instructions can become dead because we 7073 // separately emit induction "steps" when generating code for the new loop. 7074 // Similarly, we create a new latch condition when setting up the structure 7075 // of the new loop, so the old one can become dead. 7076 SmallPtrSet<Instruction *, 4> DeadInstructions; 7077 collectTriviallyDeadInstructions(DeadInstructions); 7078 7079 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 7080 VFRange SubRange = {VF, MaxVF + 1}; 7081 VPlans.push_back( 7082 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); 7083 VF = SubRange.End; 7084 } 7085 } 7086 7087 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( 7088 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, 7089 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 7090 7091 // Hold a mapping from predicated instructions to their recipes, in order to 7092 // fix their AlsoPack behavior if a user is determined to replicate and use a 7093 // scalar instead of vector value. 
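// (The map is populated in handleReplication: each predicated replicate
// recipe is recorded there, and when a replicated user of such an
// instruction is encountered the recorded recipe's AlsoPack flag is cleared
// so only the scalar value is kept; VPPredInstPHIRecipe::execute then
// finalizes the packing.)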
7094 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; 7095 7096 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); 7097 7098 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; 7099 7100 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); 7101 7102 // --------------------------------------------------------------------------- 7103 // Pre-construction: record ingredients whose recipes we'll need to further 7104 // process after constructing the initial VPlan. 7105 // --------------------------------------------------------------------------- 7106 7107 // Mark instructions we'll need to sink later and their targets as 7108 // ingredients whose recipe we'll need to record. 7109 for (auto &Entry : SinkAfter) { 7110 RecipeBuilder.recordRecipeOf(Entry.first); 7111 RecipeBuilder.recordRecipeOf(Entry.second); 7112 } 7113 7114 // For each interleave group which is relevant for this (possibly trimmed) 7115 // Range, add it to the set of groups to be later applied to the VPlan and add 7116 // placeholders for its members' Recipes which we'll be replacing with a 7117 // single VPInterleaveRecipe. 7118 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7119 auto applyIG = [IG, this](unsigned VF) -> bool { 7120 return (VF >= 2 && // Query is illegal for VF == 1 7121 CM.getWideningDecision(IG->getInsertPos(), VF) == 7122 LoopVectorizationCostModel::CM_Interleave); 7123 }; 7124 if (!getDecisionAndClampRange(applyIG, Range)) 7125 continue; 7126 InterleaveGroups.insert(IG); 7127 for (unsigned i = 0; i < IG->getFactor(); i++) 7128 if (Instruction *Member = IG->getMember(i)) 7129 RecipeBuilder.recordRecipeOf(Member); 7130 }; 7131 7132 // --------------------------------------------------------------------------- 7133 // Build initial VPlan: Scan the body of the loop in a topological order to 7134 // visit each basic block after having visited its predecessor basic blocks. 7135 // --------------------------------------------------------------------------- 7136 7137 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7138 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7139 auto Plan = std::make_unique<VPlan>(VPBB); 7140 7141 // Represent values that will have defs inside VPlan. 7142 for (Value *V : NeedDef) 7143 Plan->addVPValue(V); 7144 7145 // Scan the body of the loop in a topological order to visit each basic block 7146 // after having visited its predecessor basic blocks. 7147 LoopBlocksDFS DFS(OrigLoop); 7148 DFS.perform(LI); 7149 7150 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7151 // Relevant instructions from basic block BB will be grouped into VPRecipe 7152 // ingredients and fill a new VPBasicBlock. 7153 unsigned VPBBsForBB = 0; 7154 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7155 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7156 VPBB = FirstVPBBForBB; 7157 Builder.setInsertPoint(VPBB); 7158 7159 // Introduce each ingredient into VPlan. 7160 for (Instruction &I : BB->instructionsWithoutDebug()) { 7161 Instruction *Instr = &I; 7162 7163 // First filter out irrelevant instructions, to ensure no recipes are 7164 // built for them. 7165 if (isa<BranchInst>(Instr) || 7166 DeadInstructions.find(Instr) != DeadInstructions.end()) 7167 continue; 7168 7169 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7170 continue; 7171 7172 // Otherwise, if all widening options failed, Instruction is to be 7173 // replicated. 
This may create a successor for VPBB. 7174 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7175 Instr, Range, VPBB, PredInst2Recipe, Plan); 7176 if (NextVPBB != VPBB) { 7177 VPBB = NextVPBB; 7178 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7179 : ""); 7180 } 7181 } 7182 } 7183 7184 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7185 // may also be empty, such as the last one VPBB, reflecting original 7186 // basic-blocks with no recipes. 7187 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7188 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7189 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7190 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7191 delete PreEntry; 7192 7193 // --------------------------------------------------------------------------- 7194 // Transform initial VPlan: Apply previously taken decisions, in order, to 7195 // bring the VPlan to its final state. 7196 // --------------------------------------------------------------------------- 7197 7198 // Apply Sink-After legal constraints. 7199 for (auto &Entry : SinkAfter) { 7200 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7201 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7202 Sink->moveAfter(Target); 7203 } 7204 7205 // Interleave memory: for each Interleave Group we marked earlier as relevant 7206 // for this VPlan, replace the Recipes widening its memory instructions with a 7207 // single VPInterleaveRecipe at its insertion point. 7208 for (auto IG : InterleaveGroups) { 7209 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7210 RecipeBuilder.getRecipe(IG->getInsertPos())); 7211 (new VPInterleaveRecipe(IG, Recipe->getMask()))->insertBefore(Recipe); 7212 7213 for (unsigned i = 0; i < IG->getFactor(); ++i) 7214 if (Instruction *Member = IG->getMember(i)) { 7215 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7216 } 7217 } 7218 7219 // Finally, if tail is folded by masking, introduce selects between the phi 7220 // and the live-out instruction of each reduction, at the end of the latch. 7221 if (CM.foldTailByMasking()) { 7222 Builder.setInsertPoint(VPBB); 7223 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7224 for (auto &Reduction : *Legal->getReductionVars()) { 7225 VPValue *Phi = Plan->getVPValue(Reduction.first); 7226 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7227 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7228 } 7229 } 7230 7231 std::string PlanName; 7232 raw_string_ostream RSO(PlanName); 7233 unsigned VF = Range.Start; 7234 Plan->addVF(VF); 7235 RSO << "Initial VPlan for VF={" << VF; 7236 for (VF *= 2; VF < Range.End; VF *= 2) { 7237 Plan->addVF(VF); 7238 RSO << "," << VF; 7239 } 7240 RSO << "},UF>=1"; 7241 RSO.flush(); 7242 Plan->setName(PlanName); 7243 7244 return Plan; 7245 } 7246 7247 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7248 // Outer loop handling: They may require CFG and instruction level 7249 // transformations before even evaluating whether vectorization is profitable. 7250 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7251 // the vectorization pipeline. 
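// Unlike buildVPlanWithVPRecipes, this path constructs the VPlan directly
// from the loop nest's CFG via VPlanHCFGBuilder; VPInstructions are only
// lowered to recipes afterwards (and not at all while VPlan predication is
// still being brought up, see below).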
7252 assert(!OrigLoop->empty()); 7253 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7254 7255 // Create new empty VPlan 7256 auto Plan = std::make_unique<VPlan>(); 7257 7258 // Build hierarchical CFG 7259 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7260 HCFGBuilder.buildHierarchicalCFG(); 7261 7262 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7263 Plan->addVF(VF); 7264 7265 if (EnableVPlanPredication) { 7266 VPlanPredicator VPP(*Plan); 7267 VPP.predicate(); 7268 7269 // Avoid running transformation to recipes until masked code generation in 7270 // VPlan-native path is in place. 7271 return Plan; 7272 } 7273 7274 SmallPtrSet<Instruction *, 1> DeadInstructions; 7275 VPlanTransforms::VPInstructionsToVPRecipes( 7276 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7277 return Plan; 7278 } 7279 7280 Value* LoopVectorizationPlanner::VPCallbackILV:: 7281 getOrCreateVectorValues(Value *V, unsigned Part) { 7282 return ILV.getOrCreateVectorValue(V, Part); 7283 } 7284 7285 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { 7286 O << " +\n" 7287 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7288 IG->getInsertPos()->printAsOperand(O, false); 7289 if (User) { 7290 O << ", "; 7291 User->getOperand(0)->printAsOperand(O); 7292 } 7293 O << "\\l\""; 7294 for (unsigned i = 0; i < IG->getFactor(); ++i) 7295 if (Instruction *I = IG->getMember(i)) 7296 O << " +\n" 7297 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7298 } 7299 7300 void VPWidenRecipe::execute(VPTransformState &State) { 7301 for (auto &Instr : make_range(Begin, End)) 7302 State.ILV->widenInstruction(Instr); 7303 } 7304 7305 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7306 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7307 IsIndexLoopInvariant); 7308 } 7309 7310 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7311 assert(!State.Instance && "Int or FP induction being replicated."); 7312 State.ILV->widenIntOrFpInduction(IV, Trunc); 7313 } 7314 7315 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7316 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7317 } 7318 7319 void VPBlendRecipe::execute(VPTransformState &State) { 7320 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7321 // We know that all PHIs in non-header blocks are converted into 7322 // selects, so we don't have to worry about the insertion order and we 7323 // can just use the builder. 7324 // At this point we generate the predication tree. There may be 7325 // duplications since this is a simple recursive scan, but future 7326 // optimizations will clean it up. 7327 7328 unsigned NumIncoming = Phi->getNumIncomingValues(); 7329 7330 assert((User || NumIncoming == 1) && 7331 "Multiple predecessors with predecessors having a full mask"); 7332 // Generate a sequence of selects of the form: 7333 // SELECT(Mask3, In3, 7334 // SELECT(Mask2, In2, 7335 // ( ...))) 7336 InnerLoopVectorizer::VectorParts Entry(State.UF); 7337 for (unsigned In = 0; In < NumIncoming; ++In) { 7338 for (unsigned Part = 0; Part < State.UF; ++Part) { 7339 // We might have single edge PHIs (blocks) - use an identity 7340 // 'select' for the first PHI operand. 7341 Value *In0 = 7342 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7343 if (In == 0) 7344 Entry[Part] = In0; // Initialize with the first incoming value. 
7345 else { 7346 // Select between the current value and the previous incoming edge 7347 // based on the incoming mask. 7348 Value *Cond = State.get(User->getOperand(In), Part); 7349 Entry[Part] = 7350 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi"); 7351 } 7352 } 7353 } 7354 for (unsigned Part = 0; Part < State.UF; ++Part) 7355 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]); 7356 } 7357 7358 void VPInterleaveRecipe::execute(VPTransformState &State) { 7359 assert(!State.Instance && "Interleave group being replicated."); 7360 if (!User) 7361 return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); 7362 7363 // Last (and currently only) operand is a mask. 7364 InnerLoopVectorizer::VectorParts MaskValues(State.UF); 7365 VPValue *Mask = User->getOperand(User->getNumOperands() - 1); 7366 for (unsigned Part = 0; Part < State.UF; ++Part) 7367 MaskValues[Part] = State.get(Mask, Part); 7368 State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); 7369 } 7370 7371 void VPReplicateRecipe::execute(VPTransformState &State) { 7372 if (State.Instance) { // Generate a single instance. 7373 State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); 7374 // Insert scalar instance packing it into a vector. 7375 if (AlsoPack && State.VF > 1) { 7376 // If we're constructing lane 0, initialize to start from undef. 7377 if (State.Instance->Lane == 0) { 7378 Value *Undef = 7379 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); 7380 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); 7381 } 7382 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); 7383 } 7384 return; 7385 } 7386 7387 // Generate scalar instances for all VF lanes of all UF parts, unless the 7388 // instruction is uniform, in which case generate only the first lane for each 7389 // of the UF parts. 7390 unsigned EndLane = IsUniform ? 1 : State.VF; 7391 for (unsigned Part = 0; Part < State.UF; ++Part) 7392 for (unsigned Lane = 0; Lane < EndLane; ++Lane) 7393 State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); 7394 } 7395 7396 void VPBranchOnMaskRecipe::execute(VPTransformState &State) { 7397 assert(State.Instance && "Branch on Mask works only on single instance."); 7398 7399 unsigned Part = State.Instance->Part; 7400 unsigned Lane = State.Instance->Lane; 7401 7402 Value *ConditionBit = nullptr; 7403 if (!User) // Block in mask is all-one. 7404 ConditionBit = State.Builder.getTrue(); 7405 else { 7406 VPValue *BlockInMask = User->getOperand(0); 7407 ConditionBit = State.get(BlockInMask, Part); 7408 if (ConditionBit->getType()->isVectorTy()) 7409 ConditionBit = State.Builder.CreateExtractElement( 7410 ConditionBit, State.Builder.getInt32(Lane)); 7411 } 7412 7413 // Replace the temporary unreachable terminator with a new conditional branch, 7414 // whose two destinations will be set later when they are created.
7415 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7416 assert(isa<UnreachableInst>(CurrentTerminator) && 7417 "Expected to replace unreachable terminator with conditional branch."); 7418 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7419 CondBr->setSuccessor(0, nullptr); 7420 ReplaceInstWithInst(CurrentTerminator, CondBr); 7421 } 7422 7423 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7424 assert(State.Instance && "Predicated instruction PHI works per instance."); 7425 Instruction *ScalarPredInst = cast<Instruction>( 7426 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7427 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7428 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7429 assert(PredicatingBB && "Predicated block has no single predecessor."); 7430 7431 // By current pack/unpack logic we need to generate only a single phi node: if 7432 // a vector value for the predicated instruction exists at this point it means 7433 // the instruction has vector users only, and a phi for the vector value is 7434 // needed. In this case the recipe of the predicated instruction is marked to 7435 // also do that packing, thereby "hoisting" the insert-element sequence. 7436 // Otherwise, a phi node for the scalar value is needed. 7437 unsigned Part = State.Instance->Part; 7438 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7439 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7440 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7441 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7442 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7443 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7444 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 
7445 } else { 7446 Type *PredInstType = PredInst->getType(); 7447 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7448 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7449 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7450 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7451 } 7452 } 7453 7454 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7455 VPValue *Mask = getMask(); 7456 if (!Mask) 7457 return State.ILV->vectorizeMemoryInstruction(&Instr); 7458 7459 InnerLoopVectorizer::VectorParts MaskValues(State.UF); 7460 for (unsigned Part = 0; Part < State.UF; ++Part) 7461 MaskValues[Part] = State.get(Mask, Part); 7462 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); 7463 } 7464 7465 static ScalarEpilogueLowering 7466 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, 7467 ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, 7468 TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7469 AssumptionCache *AC, LoopInfo *LI, 7470 ScalarEvolution *SE, DominatorTree *DT, 7471 const LoopAccessInfo *LAI) { 7472 ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; 7473 bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && 7474 !PreferPredicateOverEpilog; 7475 7476 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && 7477 (F->hasOptSize() || 7478 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7479 PGSOQueryType::IRPass))) 7480 SEL = CM_ScalarEpilogueNotAllowedOptSize; 7481 else if (PreferPredicateOverEpilog || 7482 Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || 7483 (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) && 7484 Hints.getPredicate() != LoopVectorizeHints::FK_Disabled && 7485 !PredicateOptDisabled)) 7486 SEL = CM_ScalarEpilogueNotNeededUsePredicate; 7487 7488 return SEL; 7489 } 7490 7491 // Process the loop in the VPlan-native vectorization path. This path builds 7492 // VPlan upfront in the vectorization pipeline, which allows to apply 7493 // VPlan-to-VPlan transformations from the very beginning without modifying the 7494 // input LLVM IR. 7495 static bool processLoopInVPlanNativePath( 7496 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, 7497 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, 7498 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, 7499 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, 7500 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) { 7501 7502 assert(EnableVPlanNativePath && "VPlan-native path is disabled."); 7503 Function *F = L->getHeader()->getParent(); 7504 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); 7505 7506 ScalarEpilogueLowering SEL = 7507 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, 7508 PSE.getSE(), DT, LVL->getLAI()); 7509 7510 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, 7511 &Hints, IAI); 7512 // Use the planner for outer loop vectorization. 7513 // TODO: CM is not used at this point inside the planner. Turn CM into an 7514 // optional argument if we don't need it in the future. 7515 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); 7516 7517 // Get user vectorization factor. 7518 const unsigned UserVF = Hints.getWidth(); 7519 7520 // Plan how to best vectorize, return the best VF and its cost. 
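// Note: the cost model is not consulted at this point in the VPlan-native
// path (see the TODO above); the user-specified VF is handed to the planner
// directly.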
7521 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); 7522 7523 // If we are stress testing VPlan builds, do not attempt to generate vector 7524 // code. Masked vector code generation support will follow soon. 7525 // Also, do not attempt to vectorize if no vector code will be produced. 7526 if (VPlanBuildStressTest || EnableVPlanPredication || 7527 VectorizationFactor::Disabled() == VF) 7528 return false; 7529 7530 LVP.setBestPlan(VF.Width, 1); 7531 7532 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, 7533 &CM); 7534 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" 7535 << L->getHeader()->getParent()->getName() << "\"\n"); 7536 LVP.executePlan(LB, DT); 7537 7538 // Mark the loop as already vectorized to avoid vectorizing again. 7539 Hints.setAlreadyVectorized(); 7540 7541 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7542 return true; 7543 } 7544 7545 bool LoopVectorizePass::processLoop(Loop *L) { 7546 assert((EnableVPlanNativePath || L->empty()) && 7547 "VPlan-native path is not enabled. Only process inner loops."); 7548 7549 #ifndef NDEBUG 7550 const std::string DebugLocStr = getDebugLocString(L); 7551 #endif /* NDEBUG */ 7552 7553 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7554 << L->getHeader()->getParent()->getName() << "\" from " 7555 << DebugLocStr << "\n"); 7556 7557 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7558 7559 LLVM_DEBUG( 7560 dbgs() << "LV: Loop hints:" 7561 << " force=" 7562 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7563 ? "disabled" 7564 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7565 ? "enabled" 7566 : "?")) 7567 << " width=" << Hints.getWidth() 7568 << " unroll=" << Hints.getInterleave() << "\n"); 7569 7570 // Function containing loop 7571 Function *F = L->getHeader()->getParent(); 7572 7573 // Looking at the diagnostic output is the only way to determine if a loop 7574 // was vectorized (other than looking at the IR or machine code), so it 7575 // is important to generate an optimization remark for each loop. Most of 7576 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7577 // generated as OptimizationRemark and OptimizationRemarkMissed are 7578 // less verbose reporting vectorized loops and unvectorized loops that may 7579 // benefit from vectorization, respectively. 7580 7581 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7582 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7583 return false; 7584 } 7585 7586 PredicatedScalarEvolution PSE(*SE, *L); 7587 7588 // Check if it is legal to vectorize the loop. 7589 LoopVectorizationRequirements Requirements(*ORE); 7590 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7591 &Requirements, &Hints, DB, AC); 7592 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7593 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7594 Hints.emitRemarkWithHints(); 7595 return false; 7596 } 7597 7598 // Check the function attributes and profiles to find out if this function 7599 // should be optimized for size. 7600 ScalarEpilogueLowering SEL = 7601 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, 7602 PSE.getSE(), DT, LVL.getLAI()); 7603 7604 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7605 // here. They may require CFG and instruction level transformations before 7606 // even evaluating whether vectorization is profitable. 
Since we cannot modify 7607 // the incoming IR, we need to build VPlan upfront in the vectorization 7608 // pipeline. 7609 if (!L->empty()) 7610 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7611 ORE, BFI, PSI, Hints); 7612 7613 assert(L->empty() && "Inner loop expected."); 7614 7615 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7616 // count by optimizing for size, to minimize overheads. 7617 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7618 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7619 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " 7620 << "This loop is worth vectorizing only if no scalar " 7621 << "iteration overheads are incurred."); 7622 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7623 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7624 else { 7625 LLVM_DEBUG(dbgs() << "\n"); 7626 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7627 } 7628 } 7629 7630 // Check the function attributes to see if implicit floats are allowed. 7631 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7632 // an integer loop and the vector instructions selected are purely integer 7633 // vector instructions? 7634 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7635 reportVectorizationFailure( 7636 "Can't vectorize when the NoImplicitFloat attribute is used", 7637 "loop not vectorized due to NoImplicitFloat attribute", 7638 "NoImplicitFloat", ORE, L); 7639 Hints.emitRemarkWithHints(); 7640 return false; 7641 } 7642 7643 // Check if the target supports potentially unsafe FP vectorization. 7644 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7645 // for the target we're vectorizing for, to make sure none of the 7646 // additional fp-math flags can help. 7647 if (Hints.isPotentiallyUnsafe() && 7648 TTI->isFPVectorizationPotentiallyUnsafe()) { 7649 reportVectorizationFailure( 7650 "Potentially unsafe FP op prevents vectorization", 7651 "loop not vectorized due to unsafe FP support.", 7652 "UnsafeFP", ORE, L); 7653 Hints.emitRemarkWithHints(); 7654 return false; 7655 } 7656 7657 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7658 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7659 7660 // If an override option has been passed in for interleaved accesses, use it. 7661 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7662 UseInterleaved = EnableInterleavedMemAccesses; 7663 7664 // Analyze interleaved memory accesses. 7665 if (UseInterleaved) { 7666 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7667 } 7668 7669 // Use the cost model. 7670 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7671 F, &Hints, IAI); 7672 CM.collectValuesToIgnore(); 7673 7674 // Use the planner for vectorization. 7675 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); 7676 7677 // Get user vectorization factor. 7678 unsigned UserVF = Hints.getWidth(); 7679 7680 // Plan how to best vectorize, return the best VF and its cost. 7681 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7682 7683 VectorizationFactor VF = VectorizationFactor::Disabled(); 7684 unsigned IC = 1; 7685 unsigned UserIC = Hints.getInterleave(); 7686 7687 if (MaybeVF) { 7688 VF = *MaybeVF; 7689 // Select the interleave count. 7690 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7691 } 7692 7693 // Identify the diagnostic messages that should be produced. 
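// The checks below leave us in one of four states: neither vectorize nor
// interleave (emit missed remarks and bail out), interleave only, vectorize
// only, or both vectorize and interleave.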
7694 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; 7695 bool VectorizeLoop = true, InterleaveLoop = true; 7696 if (Requirements.doesNotMeet(F, L, Hints)) { 7697 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " 7698 "requirements.\n"); 7699 Hints.emitRemarkWithHints(); 7700 return false; 7701 } 7702 7703 if (VF.Width == 1) { 7704 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); 7705 VecDiagMsg = std::make_pair( 7706 "VectorizationNotBeneficial", 7707 "the cost-model indicates that vectorization is not beneficial"); 7708 VectorizeLoop = false; 7709 } 7710 7711 if (!MaybeVF && UserIC > 1) { 7712 // Tell the user interleaving was avoided up-front, despite being explicitly 7713 // requested. 7714 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " 7715 "interleaving should be avoided up front\n"); 7716 IntDiagMsg = std::make_pair( 7717 "InterleavingAvoided", 7718 "Ignoring UserIC, because interleaving was avoided up front"); 7719 InterleaveLoop = false; 7720 } else if (IC == 1 && UserIC <= 1) { 7721 // Tell the user interleaving is not beneficial. 7722 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); 7723 IntDiagMsg = std::make_pair( 7724 "InterleavingNotBeneficial", 7725 "the cost-model indicates that interleaving is not beneficial"); 7726 InterleaveLoop = false; 7727 if (UserIC == 1) { 7728 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled"; 7729 IntDiagMsg.second += 7730 " and is explicitly disabled or interleave count is set to 1"; 7731 } 7732 } else if (IC > 1 && UserIC == 1) { 7733 // Tell the user interleaving is beneficial, but it is explicitly disabled. 7734 LLVM_DEBUG( 7735 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."); 7736 IntDiagMsg = std::make_pair( 7737 "InterleavingBeneficialButDisabled", 7738 "the cost-model indicates that interleaving is beneficial " 7739 "but is explicitly disabled or interleave count is set to 1"); 7740 InterleaveLoop = false; 7741 } 7742 7743 // Override IC if user provided an interleave count. 7744 IC = UserIC > 0 ? UserIC : IC; 7745 7746 // Emit diagnostic messages, if any. 7747 const char *VAPassName = Hints.vectorizeAnalysisPassName(); 7748 if (!VectorizeLoop && !InterleaveLoop) { 7749 // Do not vectorize or interleave the loop.
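// Emit a missed remark for each of the two decisions so both reasons are
// reported to the user.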
7750 ORE->emit([&]() { 7751 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7752 L->getStartLoc(), L->getHeader()) 7753 << VecDiagMsg.second; 7754 }); 7755 ORE->emit([&]() { 7756 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7757 L->getStartLoc(), L->getHeader()) 7758 << IntDiagMsg.second; 7759 }); 7760 return false; 7761 } else if (!VectorizeLoop && InterleaveLoop) { 7762 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7763 ORE->emit([&]() { 7764 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7765 L->getStartLoc(), L->getHeader()) 7766 << VecDiagMsg.second; 7767 }); 7768 } else if (VectorizeLoop && !InterleaveLoop) { 7769 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7770 << ") in " << DebugLocStr << '\n'); 7771 ORE->emit([&]() { 7772 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7773 L->getStartLoc(), L->getHeader()) 7774 << IntDiagMsg.second; 7775 }); 7776 } else if (VectorizeLoop && InterleaveLoop) { 7777 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7778 << ") in " << DebugLocStr << '\n'); 7779 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7780 } 7781 7782 LVP.setBestPlan(VF.Width, IC); 7783 7784 using namespace ore; 7785 bool DisableRuntimeUnroll = false; 7786 MDNode *OrigLoopID = L->getLoopID(); 7787 7788 if (!VectorizeLoop) { 7789 assert(IC > 1 && "interleave count should not be 1 or 0"); 7790 // If we decided that it is not legal to vectorize the loop, then 7791 // interleave it. 7792 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7793 &CM); 7794 LVP.executePlan(Unroller, DT); 7795 7796 ORE->emit([&]() { 7797 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7798 L->getHeader()) 7799 << "interleaved loop (interleaved count: " 7800 << NV("InterleaveCount", IC) << ")"; 7801 }); 7802 } else { 7803 // If we decided that it is *legal* to vectorize the loop, then do it. 7804 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7805 &LVL, &CM); 7806 LVP.executePlan(LB, DT); 7807 ++LoopsVectorized; 7808 7809 // Add metadata to disable runtime unrolling a scalar loop when there are 7810 // no runtime checks about strides and memory. A scalar loop that is 7811 // rarely used is not worth unrolling. 7812 if (!LB.areSafetyChecksAdded()) 7813 DisableRuntimeUnroll = true; 7814 7815 // Report the vectorization decision. 7816 ORE->emit([&]() { 7817 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7818 L->getHeader()) 7819 << "vectorized loop (vectorization width: " 7820 << NV("VectorizationFactor", VF.Width) 7821 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7822 }); 7823 } 7824 7825 Optional<MDNode *> RemainderLoopID = 7826 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7827 LLVMLoopVectorizeFollowupEpilogue}); 7828 if (RemainderLoopID.hasValue()) { 7829 L->setLoopID(RemainderLoopID.getValue()); 7830 } else { 7831 if (DisableRuntimeUnroll) 7832 AddRuntimeUnrollDisableMetaData(L); 7833 7834 // Mark the loop as already vectorized to avoid vectorizing again. 
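// Note that this (like the runtime-unroll-disabling metadata above) is only
// applied when no followup loop metadata was provided; otherwise the
// remainder loop simply takes the followup loop ID.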
7835 Hints.setAlreadyVectorized(); 7836 } 7837 7838 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7839 return true; 7840 } 7841 7842 bool LoopVectorizePass::runImpl( 7843 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 7844 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 7845 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, 7846 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 7847 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 7848 SE = &SE_; 7849 LI = &LI_; 7850 TTI = &TTI_; 7851 DT = &DT_; 7852 BFI = &BFI_; 7853 TLI = TLI_; 7854 AA = &AA_; 7855 AC = &AC_; 7856 GetLAA = &GetLAA_; 7857 DB = &DB_; 7858 ORE = &ORE_; 7859 PSI = PSI_; 7860 7861 // Don't attempt if 7862 // 1. the target claims to have no vector registers, and 7863 // 2. interleaving won't help ILP. 7864 // 7865 // The second condition is necessary because, even if the target has no 7866 // vector registers, loop vectorization may still enable scalar 7867 // interleaving. 7868 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 7869 TTI->getMaxInterleaveFactor(1) < 2) 7870 return false; 7871 7872 bool Changed = false; 7873 7874 // The vectorizer requires loops to be in simplified form. 7875 // Since simplification may add new inner loops, it has to run before the 7876 // legality and profitability checks. This means running the loop vectorizer 7877 // will simplify all loops, regardless of whether anything ends up being 7878 // vectorized. 7879 for (auto &L : *LI) 7880 Changed |= 7881 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 7882 7883 // Build up a worklist of inner-loops to vectorize. This is necessary as 7884 // the act of vectorizing or partially unrolling a loop creates new loops 7885 // and can invalidate iterators across the loops. 7886 SmallVector<Loop *, 8> Worklist; 7887 7888 for (Loop *L : *LI) 7889 collectSupportedLoops(*L, LI, ORE, Worklist); 7890 7891 LoopsAnalyzed += Worklist.size(); 7892 7893 // Now walk the identified inner loops. 7894 while (!Worklist.empty()) { 7895 Loop *L = Worklist.pop_back_val(); 7896 7897 // For the inner loops we actually process, form LCSSA to simplify the 7898 // transform. 7899 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 7900 7901 Changed |= processLoop(L); 7902 } 7903 7904 // Process each loop nest in the function. 7905 return Changed; 7906 } 7907 7908 PreservedAnalyses LoopVectorizePass::run(Function &F, 7909 FunctionAnalysisManager &AM) { 7910 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 7911 auto &LI = AM.getResult<LoopAnalysis>(F); 7912 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 7913 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 7914 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 7915 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 7916 auto &AA = AM.getResult<AAManager>(F); 7917 auto &AC = AM.getResult<AssumptionAnalysis>(F); 7918 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 7919 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 7920 MemorySSA *MSSA = EnableMSSALoopDependency 7921 ?
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 7922 : nullptr; 7923 7924 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 7925 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 7926 [&](Loop &L) -> const LoopAccessInfo & { 7927 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 7928 return LAM.getResult<LoopAccessAnalysis>(L, AR); 7929 }; 7930 const ModuleAnalysisManager &MAM = 7931 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 7932 ProfileSummaryInfo *PSI = 7933 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 7934 bool Changed = 7935 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 7936 if (!Changed) 7937 return PreservedAnalyses::all(); 7938 PreservedAnalyses PA; 7939 7940 // We currently do not preserve loopinfo/dominator analyses with outer loop 7941 // vectorization. Until this is addressed, mark these analyses as preserved 7942 // only for non-VPlan-native path. 7943 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 7944 if (!EnableVPlanNativePath) { 7945 PA.preserve<LoopAnalysis>(); 7946 PA.preserve<DominatorTreeAnalysis>(); 7947 } 7948 PA.preserve<BasicAA>(); 7949 PA.preserve<GlobalsAA>(); 7950 return PA; 7951 } 7952