//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
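//
// As a simple illustration of the widening described above (a sketch, not the
// exact IR this pass emits), a scalar loop such as:
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is conceptually rewritten, for a vectorization factor (VF) of 4, into:
//
//   for (int i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + <42, 42, 42, 42>;  // one 'wide' iteration
//   for (; i < n; ++i)                         // scalar epilogue
//     a[i] = b[i] + 42;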
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired; predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of a loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
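  // For example (illustrative): an x86_fp80 carries 80 bits of data but is
  // commonly allocated with padding to 96 or 128 bits, so its alloc size and
  // type size differ and it is treated as irregular even for VF = 1.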
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L as defined
/// by the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
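///
/// A rough sketch of the control flow this class produces (the exact set of
/// blocks depends on which runtime checks are required):
///
///   [ iteration-count / SCEV / memory checks ] --(bypass)--> scalar preheader
///                        |
///     [ vector preheader ] -> [ vector body ] -> [ middle block ]
///                                                      |
///     [ scalar preheader ] -> [ original scalar loop (epilogue) ] -> [ exit ]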
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and
  /// more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
                bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate
  /// a vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use.
  /// Otherwise, if the use is scalar, we can use the existing scalar
  /// definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to
  /// with the base address given in \p Addr, optionally masking the vector
  /// operations if \p BlockInMask is non-null. Use \p State to translate given
  /// VPValues to IR values in the vectorized loop.
  void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
                                VPValue *Addr, VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi.
  /// This is the second phase of vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in
  /// the vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones
  /// (\see addNewMetadata). Use this for *newly created* instructions in the
  /// vector loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
  /// Applies dynamic knowledge to simplify SCEV expressions and converts them
  /// to a more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B,
                                              const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return A string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
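///
/// For example (an illustration only, not a fixed rule): if the scalar loop
/// body is estimated at 8 cost units per iteration and the VF = 4 body at 20
/// units, the per-lane cost of 20 / 4 = 5 beats the scalar cost of 8, so
/// VF = 4 is considered profitable. selectVectorizationFactor() makes this
/// comparison across all candidate factors.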
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// Memory access instructions may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    MaybeAlign Align = getLoadStoreAlignment(V);
    return (LI && isLegalMaskedGather(Ty, Align)) ||
           (SI && isLegalMaskedScatter(Ty, Align));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
1341 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1342
1343 /// The cost computation for Gather/Scatter instruction.
1344 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1345
1346 /// The cost computation for widening instruction \p I with consecutive
1347 /// memory access.
1348 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1349
1350 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1351 /// Load: scalar load + broadcast.
1352 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1353 /// element)
1354 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1355
1356 /// Estimate the overhead of scalarizing an instruction. This is a
1357 /// convenience wrapper for the type-based getScalarizationOverhead API.
1358 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1359
1360 /// Returns whether the instruction is a load or store and will be emitted
1361 /// as a vector operation.
1362 bool isConsecutiveLoadOrStore(Instruction *I);
1363
1364 /// Returns true if an artificially high cost for emulated masked memrefs
1365 /// should be used.
1366 bool useEmulatedMaskMemRefHack(Instruction *I);
1367
1368 /// Map of scalar integer values to the smallest bitwidth they can be legally
1369 /// represented as. The vector equivalents of these values should be truncated
1370 /// to this type.
1371 MapVector<Instruction *, uint64_t> MinBWs;
1372
1373 /// A type representing the costs for instructions if they were to be
1374 /// scalarized rather than vectorized. The entries are Instruction-Cost
1375 /// pairs.
1376 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1377
1378 /// A set containing all BasicBlocks that are known to be present after
1379 /// vectorization as a predicated block.
1380 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1381
1382 /// Records whether it is allowed to have the original scalar loop execute at
1383 /// least once. This may be needed as a fallback loop in case runtime
1384 /// aliasing/dependence checks fail, or to handle the tail/remainder
1385 /// iterations when the trip count is unknown or doesn't divide by the VF,
1386 /// or as a peel-loop to handle gaps in interleave-groups.
1387 /// Under optsize and when the trip count is very small we don't allow any
1388 /// iterations to execute in the scalar loop.
1389 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1390
1391 /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1392 bool FoldTailByMasking = false;
1393
1394 /// A map holding scalar costs for different vectorization factors. The
1395 /// presence of a cost for an instruction in the mapping indicates that the
1396 /// instruction will be scalarized when vectorizing with the associated
1397 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1398 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1399
1400 /// Holds the instructions known to be uniform after vectorization.
1401 /// The data is collected per VF.
1402 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1403
1404 /// Holds the instructions known to be scalar after vectorization.
1405 /// The data is collected per VF.
1406 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1407
1408 /// Holds the instructions (address computations) that are forced to be
1409 /// scalarized.
1410 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars; 1411 1412 /// Returns the expected difference in cost from scalarizing the expression 1413 /// feeding a predicated instruction \p PredInst. The instructions to 1414 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1415 /// non-negative return value implies the expression will be scalarized. 1416 /// Currently, only single-use chains are considered for scalarization. 1417 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, 1418 unsigned VF); 1419 1420 /// Collect the instructions that are uniform after vectorization. An 1421 /// instruction is uniform if we represent it with a single scalar value in 1422 /// the vectorized loop corresponding to each vector iteration. Examples of 1423 /// uniform instructions include pointer operands of consecutive or 1424 /// interleaved memory accesses. Note that although uniformity implies an 1425 /// instruction will be scalar, the reverse is not true. In general, a 1426 /// scalarized instruction will be represented by VF scalar values in the 1427 /// vectorized loop, each corresponding to an iteration of the original 1428 /// scalar loop. 1429 void collectLoopUniforms(unsigned VF); 1430 1431 /// Collect the instructions that are scalar after vectorization. An 1432 /// instruction is scalar if it is known to be uniform or will be scalarized 1433 /// during vectorization. Non-uniform scalarized instructions will be 1434 /// represented by VF values in the vectorized loop, each corresponding to an 1435 /// iteration of the original scalar loop. 1436 void collectLoopScalars(unsigned VF); 1437 1438 /// Keeps cost model vectorization decision and cost for instructions. 1439 /// Right now it is used for memory instructions only. 1440 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>, 1441 std::pair<InstWidening, unsigned>>; 1442 1443 DecisionList WideningDecisions; 1444 1445 /// Returns true if \p V is expected to be vectorized and it needs to be 1446 /// extracted. 1447 bool needsExtract(Value *V, unsigned VF) const { 1448 Instruction *I = dyn_cast<Instruction>(V); 1449 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) 1450 return false; 1451 1452 // Assume we can vectorize V (and hence we need extraction) if the 1453 // scalars are not computed yet. This can happen, because it is called 1454 // via getScalarizationOverhead from setCostBasedWideningDecision, before 1455 // the scalars are collected. That should be a safe assumption in most 1456 // cases, because we check if the operands have vectorizable types 1457 // beforehand in LoopVectorizationLegality. 1458 return Scalars.find(VF) == Scalars.end() || 1459 !isScalarAfterVectorization(I, VF); 1460 }; 1461 1462 /// Returns a range containing only operands needing to be extracted. 1463 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, 1464 unsigned VF) { 1465 return SmallVector<Value *, 4>(make_filter_range( 1466 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); 1467 } 1468 1469 public: 1470 /// The loop that we evaluate. 1471 Loop *TheLoop; 1472 1473 /// Predicated scalar evolution analysis. 1474 PredicatedScalarEvolution &PSE; 1475 1476 /// Loop Info analysis. 1477 LoopInfo *LI; 1478 1479 /// Vectorization legality. 1480 LoopVectorizationLegality *Legal; 1481 1482 /// Vector target information. 1483 const TargetTransformInfo &TTI; 1484 1485 /// Target Library Info. 
1486 const TargetLibraryInfo *TLI; 1487 1488 /// Demanded bits analysis. 1489 DemandedBits *DB; 1490 1491 /// Assumption cache. 1492 AssumptionCache *AC; 1493 1494 /// Interface to emit optimization remarks. 1495 OptimizationRemarkEmitter *ORE; 1496 1497 const Function *TheFunction; 1498 1499 /// Loop Vectorize Hint. 1500 const LoopVectorizeHints *Hints; 1501 1502 /// The interleave access information contains groups of interleaved accesses 1503 /// with the same stride and close to each other. 1504 InterleavedAccessInfo &InterleaveInfo; 1505 1506 /// Values to ignore in the cost model. 1507 SmallPtrSet<const Value *, 16> ValuesToIgnore; 1508 1509 /// Values to ignore in the cost model when VF > 1. 1510 SmallPtrSet<const Value *, 16> VecValuesToIgnore; 1511 }; 1512 1513 } // end namespace llvm 1514 1515 // Return true if \p OuterLp is an outer loop annotated with hints for explicit 1516 // vectorization. The loop needs to be annotated with #pragma omp simd 1517 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 1518 // vector length information is not provided, vectorization is not considered 1519 // explicit. Interleave hints are not allowed either. These limitations will be 1520 // relaxed in the future. 1521 // Please, note that we are currently forced to abuse the pragma 'clang 1522 // vectorize' semantics. This pragma provides *auto-vectorization hints* 1523 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 1524 // provides *explicit vectorization hints* (LV can bypass legal checks and 1525 // assume that vectorization is legal). However, both hints are implemented 1526 // using the same metadata (llvm.loop.vectorize, processed by 1527 // LoopVectorizeHints). This will be fixed in the future when the native IR 1528 // representation for pragma 'omp simd' is introduced. 1529 static bool isExplicitVecOuterLoop(Loop *OuterLp, 1530 OptimizationRemarkEmitter *ORE) { 1531 assert(!OuterLp->empty() && "This is not an outer loop"); 1532 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); 1533 1534 // Only outer loops with an explicit vectorization hint are supported. 1535 // Unannotated outer loops are ignored. 1536 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) 1537 return false; 1538 1539 Function *Fn = OuterLp->getHeader()->getParent(); 1540 if (!Hints.allowVectorization(Fn, OuterLp, 1541 true /*VectorizeOnlyWhenForced*/)) { 1542 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"); 1543 return false; 1544 } 1545 1546 if (Hints.getInterleave() > 1) { 1547 // TODO: Interleave support is future work. 1548 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " 1549 "outer loops.\n"); 1550 Hints.emitRemarkWithHints(); 1551 return false; 1552 } 1553 1554 return true; 1555 } 1556 1557 static void collectSupportedLoops(Loop &L, LoopInfo *LI, 1558 OptimizationRemarkEmitter *ORE, 1559 SmallVectorImpl<Loop *> &V) { 1560 // Collect inner loops and outer loops without irreducible control flow. For 1561 // now, only collect outer loops that have explicit vectorization hints. If we 1562 // are stress testing the VPlan H-CFG construction, we collect the outermost 1563 // loop of every loop nest. 
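// For illustration (hypothetical source, not taken from this file): on the
// VPlan-native path an outer loop is only collected when it carries an
// explicit hint with a vector length, e.g.
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// An annotated outer loop without a vector length is not considered explicit
// and is rejected by isExplicitVecOuterLoop() above.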
1564 if (L.empty() || VPlanBuildStressTest || 1565 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) { 1566 LoopBlocksRPO RPOT(&L); 1567 RPOT.perform(LI); 1568 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) { 1569 V.push_back(&L); 1570 // TODO: Collect inner loops inside marked outer loops in case 1571 // vectorization fails for the outer loop. Do not invoke 1572 // 'containsIrreducibleCFG' again for inner loops when the outer loop is 1573 // already known to be reducible. We can use an inherited attribute for 1574 // that. 1575 return; 1576 } 1577 } 1578 for (Loop *InnerL : L) 1579 collectSupportedLoops(*InnerL, LI, ORE, V); 1580 } 1581 1582 namespace { 1583 1584 /// The LoopVectorize Pass. 1585 struct LoopVectorize : public FunctionPass { 1586 /// Pass identification, replacement for typeid 1587 static char ID; 1588 1589 LoopVectorizePass Impl; 1590 1591 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false, 1592 bool VectorizeOnlyWhenForced = false) 1593 : FunctionPass(ID) { 1594 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced; 1595 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced; 1596 initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); 1597 } 1598 1599 bool runOnFunction(Function &F) override { 1600 if (skipFunction(F)) 1601 return false; 1602 1603 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1604 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1605 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1606 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1607 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); 1608 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); 1609 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; 1610 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 1611 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1612 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); 1613 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); 1614 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1615 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 1616 1617 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 1618 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; 1619 1620 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC, 1621 GetLAA, *ORE, PSI); 1622 } 1623 1624 void getAnalysisUsage(AnalysisUsage &AU) const override { 1625 AU.addRequired<AssumptionCacheTracker>(); 1626 AU.addRequired<BlockFrequencyInfoWrapperPass>(); 1627 AU.addRequired<DominatorTreeWrapperPass>(); 1628 AU.addRequired<LoopInfoWrapperPass>(); 1629 AU.addRequired<ScalarEvolutionWrapperPass>(); 1630 AU.addRequired<TargetTransformInfoWrapperPass>(); 1631 AU.addRequired<AAResultsWrapperPass>(); 1632 AU.addRequired<LoopAccessLegacyAnalysis>(); 1633 AU.addRequired<DemandedBitsWrapperPass>(); 1634 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1635 AU.addRequired<InjectTLIMappingsLegacy>(); 1636 1637 // We currently do not preserve loopinfo/dominator analyses with outer loop 1638 // vectorization. Until this is addressed, mark these analyses as preserved 1639 // only for non-VPlan-native path. 1640 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
1641 if (!EnableVPlanNativePath) { 1642 AU.addPreserved<LoopInfoWrapperPass>(); 1643 AU.addPreserved<DominatorTreeWrapperPass>(); 1644 } 1645 1646 AU.addPreserved<BasicAAWrapperPass>(); 1647 AU.addPreserved<GlobalsAAWrapperPass>(); 1648 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 1649 } 1650 }; 1651 1652 } // end anonymous namespace 1653 1654 //===----------------------------------------------------------------------===// 1655 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 1656 // LoopVectorizationCostModel and LoopVectorizationPlanner. 1657 //===----------------------------------------------------------------------===// 1658 1659 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { 1660 // We need to place the broadcast of invariant variables outside the loop, 1661 // but only if it's proven safe to do so. Else, broadcast will be inside 1662 // vector loop body. 1663 Instruction *Instr = dyn_cast<Instruction>(V); 1664 bool SafeToHoist = OrigLoop->isLoopInvariant(V) && 1665 (!Instr || 1666 DT->dominates(Instr->getParent(), LoopVectorPreHeader)); 1667 // Place the code for broadcasting invariant variables in the new preheader. 1668 IRBuilder<>::InsertPointGuard Guard(Builder); 1669 if (SafeToHoist) 1670 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1671 1672 // Broadcast the scalar into all locations in the vector. 1673 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); 1674 1675 return Shuf; 1676 } 1677 1678 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( 1679 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) { 1680 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1681 "Expected either an induction phi-node or a truncate of it!"); 1682 Value *Start = II.getStartValue(); 1683 1684 // Construct the initial value of the vector IV in the vector loop preheader 1685 auto CurrIP = Builder.saveIP(); 1686 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 1687 if (isa<TruncInst>(EntryVal)) { 1688 assert(Start->getType()->isIntegerTy() && 1689 "Truncation requires an integer type"); 1690 auto *TruncType = cast<IntegerType>(EntryVal->getType()); 1691 Step = Builder.CreateTrunc(Step, TruncType); 1692 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType); 1693 } 1694 Value *SplatStart = Builder.CreateVectorSplat(VF, Start); 1695 Value *SteppedStart = 1696 getStepVector(SplatStart, 0, Step, II.getInductionOpcode()); 1697 1698 // We create vector phi nodes for both integer and floating-point induction 1699 // variables. Here, we determine the kind of arithmetic we will perform. 1700 Instruction::BinaryOps AddOp; 1701 Instruction::BinaryOps MulOp; 1702 if (Step->getType()->isIntegerTy()) { 1703 AddOp = Instruction::Add; 1704 MulOp = Instruction::Mul; 1705 } else { 1706 AddOp = II.getInductionOpcode(); 1707 MulOp = Instruction::FMul; 1708 } 1709 1710 // Multiply the vectorization factor by the step using integer or 1711 // floating-point arithmetic as appropriate. 1712 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF); 1713 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); 1714 1715 // Create a vector splat to use in the induction update. 1716 // 1717 // FIXME: If the step is non-constant, we create the vector splat with 1718 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't 1719 // handle a constant vector splat. 1720 Value *SplatVF = 1721 isa<Constant>(Mul) 1722 ? 
ConstantVector::getSplat({VF, false}, cast<Constant>(Mul)) 1723 : Builder.CreateVectorSplat(VF, Mul); 1724 Builder.restoreIP(CurrIP); 1725 1726 // We may need to add the step a number of times, depending on the unroll 1727 // factor. The last of those goes into the PHI. 1728 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind", 1729 &*LoopVectorBody->getFirstInsertionPt()); 1730 VecInd->setDebugLoc(EntryVal->getDebugLoc()); 1731 Instruction *LastInduction = VecInd; 1732 for (unsigned Part = 0; Part < UF; ++Part) { 1733 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); 1734 1735 if (isa<TruncInst>(EntryVal)) 1736 addMetadata(LastInduction, EntryVal); 1737 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part); 1738 1739 LastInduction = cast<Instruction>(addFastMathFlag( 1740 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"))); 1741 LastInduction->setDebugLoc(EntryVal->getDebugLoc()); 1742 } 1743 1744 // Move the last step to the end of the latch block. This ensures consistent 1745 // placement of all induction updates. 1746 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); 1747 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator()); 1748 auto *ICmp = cast<Instruction>(Br->getCondition()); 1749 LastInduction->moveBefore(ICmp); 1750 LastInduction->setName("vec.ind.next"); 1751 1752 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader); 1753 VecInd->addIncoming(LastInduction, LoopVectorLatch); 1754 } 1755 1756 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const { 1757 return Cost->isScalarAfterVectorization(I, VF) || 1758 Cost->isProfitableToScalarize(I, VF); 1759 } 1760 1761 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { 1762 if (shouldScalarizeInstruction(IV)) 1763 return true; 1764 auto isScalarInst = [&](User *U) -> bool { 1765 auto *I = cast<Instruction>(U); 1766 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I)); 1767 }; 1768 return llvm::any_of(IV->users(), isScalarInst); 1769 } 1770 1771 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( 1772 const InductionDescriptor &ID, const Instruction *EntryVal, 1773 Value *VectorLoopVal, unsigned Part, unsigned Lane) { 1774 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && 1775 "Expected either an induction phi-node or a truncate of it!"); 1776 1777 // This induction variable is not the phi from the original loop but the 1778 // newly-created IV based on the proof that casted Phi is equal to the 1779 // uncasted Phi in the vectorized loop (under a runtime guard possibly). It 1780 // re-uses the same InductionDescriptor that original IV uses but we don't 1781 // have to do any recording in this case - that is done when original IV is 1782 // processed. 1783 if (isa<TruncInst>(EntryVal)) 1784 return; 1785 1786 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts(); 1787 if (Casts.empty()) 1788 return; 1789 // Only the first Cast instruction in the Casts vector is of interest. 1790 // The rest of the Casts (if exist) have no uses outside the 1791 // induction update chain itself. 
1792 Instruction *CastInst = *Casts.begin(); 1793 if (Lane < UINT_MAX) 1794 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); 1795 else 1796 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); 1797 } 1798 1799 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { 1800 assert((IV->getType()->isIntegerTy() || IV != OldInduction) && 1801 "Primary induction variable must have an integer type"); 1802 1803 auto II = Legal->getInductionVars().find(IV); 1804 assert(II != Legal->getInductionVars().end() && "IV is not an induction"); 1805 1806 auto ID = II->second; 1807 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); 1808 1809 // The value from the original loop to which we are mapping the new induction 1810 // variable. 1811 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; 1812 1813 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 1814 1815 // Generate code for the induction step. Note that induction steps are 1816 // required to be loop-invariant 1817 auto CreateStepValue = [&](const SCEV *Step) -> Value * { 1818 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && 1819 "Induction step should be loop invariant"); 1820 if (PSE.getSE()->isSCEVable(IV->getType())) { 1821 SCEVExpander Exp(*PSE.getSE(), DL, "induction"); 1822 return Exp.expandCodeFor(Step, Step->getType(), 1823 LoopVectorPreHeader->getTerminator()); 1824 } 1825 return cast<SCEVUnknown>(Step)->getValue(); 1826 }; 1827 1828 // The scalar value to broadcast. This is derived from the canonical 1829 // induction variable. If a truncation type is given, truncate the canonical 1830 // induction variable and step. Otherwise, derive these values from the 1831 // induction descriptor. 1832 auto CreateScalarIV = [&](Value *&Step) -> Value * { 1833 Value *ScalarIV = Induction; 1834 if (IV != OldInduction) { 1835 ScalarIV = IV->getType()->isIntegerTy() 1836 ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) 1837 : Builder.CreateCast(Instruction::SIToFP, Induction, 1838 IV->getType()); 1839 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); 1840 ScalarIV->setName("offset.idx"); 1841 } 1842 if (Trunc) { 1843 auto *TruncType = cast<IntegerType>(Trunc->getType()); 1844 assert(Step->getType()->isIntegerTy() && 1845 "Truncation requires an integer step"); 1846 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType); 1847 Step = Builder.CreateTrunc(Step, TruncType); 1848 } 1849 return ScalarIV; 1850 }; 1851 1852 // Create the vector values from the scalar IV, in the absence of creating a 1853 // vector IV. 1854 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { 1855 Value *Broadcasted = getBroadcastInstrs(ScalarIV); 1856 for (unsigned Part = 0; Part < UF; ++Part) { 1857 Value *EntryPart = 1858 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); 1859 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); 1860 if (Trunc) 1861 addMetadata(EntryPart, Trunc); 1862 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part); 1863 } 1864 }; 1865 1866 // Now do the actual transformations, and start with creating the step value. 1867 Value *Step = CreateStepValue(ID.getStep()); 1868 if (VF <= 1) { 1869 Value *ScalarIV = CreateScalarIV(Step); 1870 CreateSplatIV(ScalarIV, Step); 1871 return; 1872 } 1873 1874 // Determine if we want a scalar version of the induction variable. 
This is 1875 // true if the induction variable itself is not widened, or if it has at 1876 // least one user in the loop that is not widened. 1877 auto NeedsScalarIV = needsScalarInduction(EntryVal); 1878 if (!NeedsScalarIV) { 1879 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1880 return; 1881 } 1882 1883 // Try to create a new independent vector induction variable. If we can't 1884 // create the phi node, we will splat the scalar induction variable in each 1885 // loop iteration. 1886 if (!shouldScalarizeInstruction(EntryVal)) { 1887 createVectorIntOrFpInductionPHI(ID, Step, EntryVal); 1888 Value *ScalarIV = CreateScalarIV(Step); 1889 // Create scalar steps that can be used by instructions we will later 1890 // scalarize. Note that the addition of the scalar steps will not increase 1891 // the number of instructions in the loop in the common case prior to 1892 // InstCombine. We will be trading one vector extract for each scalar step. 1893 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1894 return; 1895 } 1896 1897 // If we haven't yet vectorized the induction variable, splat the scalar 1898 // induction variable, and build the necessary step vectors. 1899 // TODO: Don't do it unless the vectorized IV is really required. 1900 Value *ScalarIV = CreateScalarIV(Step); 1901 CreateSplatIV(ScalarIV, Step); 1902 buildScalarSteps(ScalarIV, Step, EntryVal, ID); 1903 } 1904 1905 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step, 1906 Instruction::BinaryOps BinOp) { 1907 // Create and check the types. 1908 assert(Val->getType()->isVectorTy() && "Must be a vector"); 1909 int VLen = Val->getType()->getVectorNumElements(); 1910 1911 Type *STy = Val->getType()->getScalarType(); 1912 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) && 1913 "Induction Step must be an integer or FP"); 1914 assert(Step->getType() == STy && "Step has wrong type"); 1915 1916 SmallVector<Constant *, 8> Indices; 1917 1918 if (STy->isIntegerTy()) { 1919 // Create a vector of consecutive numbers from zero to VF. 1920 for (int i = 0; i < VLen; ++i) 1921 Indices.push_back(ConstantInt::get(STy, StartIdx + i)); 1922 1923 // Add the consecutive indices to the vector value. 1924 Constant *Cv = ConstantVector::get(Indices); 1925 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); 1926 Step = Builder.CreateVectorSplat(VLen, Step); 1927 assert(Step->getType() == Val->getType() && "Invalid step vec"); 1928 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 1929 // which can be found from the original scalar operations. 1930 Step = Builder.CreateMul(Cv, Step); 1931 return Builder.CreateAdd(Val, Step, "induction"); 1932 } 1933 1934 // Floating point induction. 1935 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && 1936 "Binary Opcode should be specified for FP induction"); 1937 // Create a vector of consecutive numbers from zero to VF. 1938 for (int i = 0; i < VLen; ++i) 1939 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i))); 1940 1941 // Add the consecutive indices to the vector value. 1942 Constant *Cv = ConstantVector::get(Indices); 1943 1944 Step = Builder.CreateVectorSplat(VLen, Step); 1945 1946 // Floating point operations had to be 'fast' to enable the induction. 
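// For illustration (assuming VF = 4, StartIdx = 0, an fadd induction and a
// float step %s; value names are hypothetical), the code below produces
// roughly
//   %mul = fmul fast <4 x float> <float 0.0, float 1.0, float 2.0, float 3.0>, %s.splat
//   %induction = fadd fast <4 x float> %val, %mul
// i.e. lane i of the result is Val + (StartIdx + i) * Step.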
1947 FastMathFlags Flags; 1948 Flags.setFast(); 1949 1950 Value *MulOp = Builder.CreateFMul(Cv, Step); 1951 if (isa<Instruction>(MulOp)) 1952 // Have to check, MulOp may be a constant 1953 cast<Instruction>(MulOp)->setFastMathFlags(Flags); 1954 1955 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction"); 1956 if (isa<Instruction>(BOp)) 1957 cast<Instruction>(BOp)->setFastMathFlags(Flags); 1958 return BOp; 1959 } 1960 1961 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, 1962 Instruction *EntryVal, 1963 const InductionDescriptor &ID) { 1964 // We shouldn't have to build scalar steps if we aren't vectorizing. 1965 assert(VF > 1 && "VF should be greater than one"); 1966 1967 // Get the value type and ensure it and the step have the same integer type. 1968 Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); 1969 assert(ScalarIVTy == Step->getType() && 1970 "Val and Step should have the same type"); 1971 1972 // We build scalar steps for both integer and floating-point induction 1973 // variables. Here, we determine the kind of arithmetic we will perform. 1974 Instruction::BinaryOps AddOp; 1975 Instruction::BinaryOps MulOp; 1976 if (ScalarIVTy->isIntegerTy()) { 1977 AddOp = Instruction::Add; 1978 MulOp = Instruction::Mul; 1979 } else { 1980 AddOp = ID.getInductionOpcode(); 1981 MulOp = Instruction::FMul; 1982 } 1983 1984 // Determine the number of scalars we need to generate for each unroll 1985 // iteration. If EntryVal is uniform, we only need to generate the first 1986 // lane. Otherwise, we generate all VF values. 1987 unsigned Lanes = 1988 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 1989 : VF; 1990 // Compute the scalar steps and save the results in VectorLoopValueMap. 1991 for (unsigned Part = 0; Part < UF; ++Part) { 1992 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 1993 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); 1994 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); 1995 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); 1996 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); 1997 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane); 1998 } 1999 } 2000 } 2001 2002 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { 2003 assert(V != Induction && "The new induction variable should not be used."); 2004 assert(!V->getType()->isVectorTy() && "Can't widen a vector"); 2005 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2006 2007 // If we have a stride that is replaced by one, do it here. Defer this for 2008 // the VPlan-native path until we start running Legal checks in that path. 2009 if (!EnableVPlanNativePath && Legal->hasStride(V)) 2010 V = ConstantInt::get(V->getType(), 1); 2011 2012 // If we have a vector mapped to this value, return it. 2013 if (VectorLoopValueMap.hasVectorValue(V, Part)) 2014 return VectorLoopValueMap.getVectorValue(V, Part); 2015 2016 // If the value has not been vectorized, check if it has been scalarized 2017 // instead. If it has been scalarized, and we actually need the value in 2018 // vector form, we will construct the vector values on demand. 2019 if (VectorLoopValueMap.hasAnyScalarValue(V)) { 2020 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); 2021 2022 // If we've scalarized a value, that value should be an instruction. 
2023 auto *I = cast<Instruction>(V); 2024 2025 // If we aren't vectorizing, we can just copy the scalar map values over to 2026 // the vector map. 2027 if (VF == 1) { 2028 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue); 2029 return ScalarValue; 2030 } 2031 2032 // Get the last scalar instruction we generated for V and Part. If the value 2033 // is known to be uniform after vectorization, this corresponds to lane zero 2034 // of the Part unroll iteration. Otherwise, the last instruction is the one 2035 // we created for the last vector lane of the Part unroll iteration. 2036 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; 2037 auto *LastInst = cast<Instruction>( 2038 VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); 2039 2040 // Set the insert point after the last scalarized instruction. This ensures 2041 // the insertelement sequence will directly follow the scalar definitions. 2042 auto OldIP = Builder.saveIP(); 2043 auto NewIP = std::next(BasicBlock::iterator(LastInst)); 2044 Builder.SetInsertPoint(&*NewIP); 2045 2046 // However, if we are vectorizing, we need to construct the vector values. 2047 // If the value is known to be uniform after vectorization, we can just 2048 // broadcast the scalar value corresponding to lane zero for each unroll 2049 // iteration. Otherwise, we construct the vector values using insertelement 2050 // instructions. Since the resulting vectors are stored in 2051 // VectorLoopValueMap, we will only generate the insertelements once. 2052 Value *VectorValue = nullptr; 2053 if (Cost->isUniformAfterVectorization(I, VF)) { 2054 VectorValue = getBroadcastInstrs(ScalarValue); 2055 VectorLoopValueMap.setVectorValue(V, Part, VectorValue); 2056 } else { 2057 // Initialize packing with insertelements to start from undef. 2058 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); 2059 VectorLoopValueMap.setVectorValue(V, Part, Undef); 2060 for (unsigned Lane = 0; Lane < VF; ++Lane) 2061 packScalarIntoVectorValue(V, {Part, Lane}); 2062 VectorValue = VectorLoopValueMap.getVectorValue(V, Part); 2063 } 2064 Builder.restoreIP(OldIP); 2065 return VectorValue; 2066 } 2067 2068 // If this scalar is unknown, assume that it is a constant or that it is 2069 // loop invariant. Broadcast V and save the value for future uses. 2070 Value *B = getBroadcastInstrs(V); 2071 VectorLoopValueMap.setVectorValue(V, Part, B); 2072 return B; 2073 } 2074 2075 Value * 2076 InnerLoopVectorizer::getOrCreateScalarValue(Value *V, 2077 const VPIteration &Instance) { 2078 // If the value is not an instruction contained in the loop, it should 2079 // already be scalar. 2080 if (OrigLoop->isLoopInvariant(V)) 2081 return V; 2082 2083 assert(Instance.Lane > 0 2084 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) 2085 : true && "Uniform values only have lane zero"); 2086 2087 // If the value from the original loop has not been vectorized, it is 2088 // represented by UF x VF scalar values in the new loop. Return the requested 2089 // scalar value. 2090 if (VectorLoopValueMap.hasScalarValue(V, Instance)) 2091 return VectorLoopValueMap.getScalarValue(V, Instance); 2092 2093 // If the value has not been scalarized, get its entry in VectorLoopValueMap 2094 // for the given unroll part. If this entry is not a vector type (i.e., the 2095 // vectorization factor is one), there is no need to generate an 2096 // extractelement instruction. 
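// For illustration (hypothetical value names): with VF = 4, requesting lane 2
// of part 0 for an already-vectorized i32 value produces
//   %scalar = extractelement <4 x i32> %part0.vec, i32 2
// whereas with VF = 1 the part's value is returned directly.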
2097 auto *U = getOrCreateVectorValue(V, Instance.Part); 2098 if (!U->getType()->isVectorTy()) { 2099 assert(VF == 1 && "Value not scalarized has non-vector type"); 2100 return U; 2101 } 2102 2103 // Otherwise, the value from the original loop has been vectorized and is 2104 // represented by UF vector values. Extract and return the requested scalar 2105 // value from the appropriate vector lane. 2106 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); 2107 } 2108 2109 void InnerLoopVectorizer::packScalarIntoVectorValue( 2110 Value *V, const VPIteration &Instance) { 2111 assert(V != Induction && "The new induction variable should not be used."); 2112 assert(!V->getType()->isVectorTy() && "Can't pack a vector"); 2113 assert(!V->getType()->isVoidTy() && "Type does not produce a value"); 2114 2115 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); 2116 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); 2117 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, 2118 Builder.getInt32(Instance.Lane)); 2119 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); 2120 } 2121 2122 Value *InnerLoopVectorizer::reverseVector(Value *Vec) { 2123 assert(Vec->getType()->isVectorTy() && "Invalid type"); 2124 SmallVector<Constant *, 8> ShuffleMask; 2125 for (unsigned i = 0; i < VF; ++i) 2126 ShuffleMask.push_back(Builder.getInt32(VF - i - 1)); 2127 2128 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), 2129 ConstantVector::get(ShuffleMask), 2130 "reverse"); 2131 } 2132 2133 // Return whether we allow using masked interleave-groups (for dealing with 2134 // strided loads/stores that reside in predicated blocks, or for dealing 2135 // with gaps). 2136 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { 2137 // If an override option has been passed in for interleaved accesses, use it. 2138 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) 2139 return EnableMaskedInterleavedMemAccesses; 2140 2141 return TTI.enableMaskedInterleavedAccessVectorization(); 2142 } 2143 2144 // Try to vectorize the interleave group that \p Instr belongs to. 2145 // 2146 // E.g. Translate following interleaved load group (factor = 3): 2147 // for (i = 0; i < N; i+=3) { 2148 // R = Pic[i]; // Member of index 0 2149 // G = Pic[i+1]; // Member of index 1 2150 // B = Pic[i+2]; // Member of index 2 2151 // ... // do something to R, G, B 2152 // } 2153 // To: 2154 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B 2155 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements 2156 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements 2157 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements 2158 // 2159 // Or translate following interleaved store group (factor = 3): 2160 // for (i = 0; i < N; i+=3) { 2161 // ... 
do something to R, G, B 2162 // Pic[i] = R; // Member of index 0 2163 // Pic[i+1] = G; // Member of index 1 2164 // Pic[i+2] = B; // Member of index 2 2165 // } 2166 // To: 2167 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> 2168 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> 2169 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, 2170 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements 2171 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B 2172 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, 2173 VPTransformState &State, 2174 VPValue *Addr, 2175 VPValue *BlockInMask) { 2176 const InterleaveGroup<Instruction> *Group = 2177 Cost->getInterleavedAccessGroup(Instr); 2178 assert(Group && "Fail to get an interleaved access group."); 2179 2180 // Skip if current instruction is not the insert position. 2181 if (Instr != Group->getInsertPos()) 2182 return; 2183 2184 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2185 2186 // Prepare for the vector type of the interleaved load/store. 2187 Type *ScalarTy = getMemInstValueType(Instr); 2188 unsigned InterleaveFactor = Group->getFactor(); 2189 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); 2190 2191 // Prepare for the new pointers. 2192 SmallVector<Value *, 2> AddrParts; 2193 unsigned Index = Group->getIndex(Instr); 2194 2195 // TODO: extend the masked interleaved-group support to reversed access. 2196 assert((!BlockInMask || !Group->isReverse()) && 2197 "Reversed masked interleave-group not supported."); 2198 2199 // If the group is reverse, adjust the index to refer to the last vector lane 2200 // instead of the first. We adjust the index from the first vector lane, 2201 // rather than directly getting the pointer for lane VF - 1, because the 2202 // pointer operand of the interleaved access is supposed to be uniform. For 2203 // uniform instructions, we're only required to generate a value for the 2204 // first vector lane in each unroll iteration. 2205 if (Group->isReverse()) 2206 Index += (VF - 1) * Group->getFactor(); 2207 2208 for (unsigned Part = 0; Part < UF; Part++) { 2209 Value *AddrPart = State.get(Addr, {Part, 0}); 2210 setDebugLocFromInst(Builder, AddrPart); 2211 2212 // Notice current instruction could be any index. Need to adjust the address 2213 // to the member of index 0. 2214 // 2215 // E.g. a = A[i+1]; // Member of index 1 (Current instruction) 2216 // b = A[i]; // Member of index 0 2217 // Current pointer is pointed to A[i+1], adjust it to A[i]. 2218 // 2219 // E.g. A[i+1] = a; // Member of index 1 2220 // A[i] = b; // Member of index 0 2221 // A[i+2] = c; // Member of index 2 (Current instruction) 2222 // Current pointer is pointed to A[i+2], adjust it to A[i]. 2223 2224 bool InBounds = false; 2225 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) 2226 InBounds = gep->isInBounds(); 2227 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); 2228 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); 2229 2230 // Cast to the vector pointer type. 
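// For illustration, continuing the factor-3, VF = 4 example above (value
// names are hypothetical): the adjusted member-0 address of an i32 group is
// cast as
//   %wide.ptr = bitcast i32* %member0.addr to <12 x i32>*
// so a single wide access per unroll part can cover the whole group.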
2231 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); 2232 Type *PtrTy = VecTy->getPointerTo(AddressSpace); 2233 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); 2234 } 2235 2236 setDebugLocFromInst(Builder, Instr); 2237 Value *UndefVec = UndefValue::get(VecTy); 2238 2239 Value *MaskForGaps = nullptr; 2240 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { 2241 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group); 2242 assert(MaskForGaps && "Mask for Gaps is required but it is null"); 2243 } 2244 2245 // Vectorize the interleaved load group. 2246 if (isa<LoadInst>(Instr)) { 2247 // For each unroll part, create a wide load for the group. 2248 SmallVector<Value *, 2> NewLoads; 2249 for (unsigned Part = 0; Part < UF; Part++) { 2250 Instruction *NewLoad; 2251 if (BlockInMask || MaskForGaps) { 2252 assert(useMaskedInterleavedAccesses(*TTI) && 2253 "masked interleaved groups are not allowed."); 2254 Value *GroupMask = MaskForGaps; 2255 if (BlockInMask) { 2256 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2257 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2258 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2259 Value *ShuffledMask = Builder.CreateShuffleVector( 2260 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2261 GroupMask = MaskForGaps 2262 ? Builder.CreateBinOp(Instruction::And, ShuffledMask, 2263 MaskForGaps) 2264 : ShuffledMask; 2265 } 2266 NewLoad = 2267 Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(), 2268 GroupMask, UndefVec, "wide.masked.vec"); 2269 } 2270 else 2271 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], 2272 Group->getAlign(), "wide.vec"); 2273 Group->addMetadata(NewLoad); 2274 NewLoads.push_back(NewLoad); 2275 } 2276 2277 // For each member in the group, shuffle out the appropriate data from the 2278 // wide loads. 2279 for (unsigned I = 0; I < InterleaveFactor; ++I) { 2280 Instruction *Member = Group->getMember(I); 2281 2282 // Skip the gaps in the group. 2283 if (!Member) 2284 continue; 2285 2286 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF); 2287 for (unsigned Part = 0; Part < UF; Part++) { 2288 Value *StridedVec = Builder.CreateShuffleVector( 2289 NewLoads[Part], UndefVec, StrideMask, "strided.vec"); 2290 2291 // If this member has different type, cast the result type. 2292 if (Member->getType() != ScalarTy) { 2293 VectorType *OtherVTy = VectorType::get(Member->getType(), VF); 2294 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); 2295 } 2296 2297 if (Group->isReverse()) 2298 StridedVec = reverseVector(StridedVec); 2299 2300 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec); 2301 } 2302 } 2303 return; 2304 } 2305 2306 // The sub vector type for current instruction. 2307 VectorType *SubVT = VectorType::get(ScalarTy, VF); 2308 2309 // Vectorize the interleaved store group. 2310 for (unsigned Part = 0; Part < UF; Part++) { 2311 // Collect the stored vector from each member. 
2312 SmallVector<Value *, 4> StoredVecs; 2313 for (unsigned i = 0; i < InterleaveFactor; i++) { 2314 // Interleaved store group doesn't allow a gap, so each index has a member 2315 Instruction *Member = Group->getMember(i); 2316 assert(Member && "Fail to get a member from an interleaved store group"); 2317 2318 Value *StoredVec = getOrCreateVectorValue( 2319 cast<StoreInst>(Member)->getValueOperand(), Part); 2320 if (Group->isReverse()) 2321 StoredVec = reverseVector(StoredVec); 2322 2323 // If this member has different type, cast it to a unified type. 2324 2325 if (StoredVec->getType() != SubVT) 2326 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL); 2327 2328 StoredVecs.push_back(StoredVec); 2329 } 2330 2331 // Concatenate all vectors into a wide vector. 2332 Value *WideVec = concatenateVectors(Builder, StoredVecs); 2333 2334 // Interleave the elements in the wide vector. 2335 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor); 2336 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, 2337 "interleaved.vec"); 2338 2339 Instruction *NewStoreInstr; 2340 if (BlockInMask) { 2341 Value *BlockInMaskPart = State.get(BlockInMask, Part); 2342 auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); 2343 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); 2344 Value *ShuffledMask = Builder.CreateShuffleVector( 2345 BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); 2346 NewStoreInstr = Builder.CreateMaskedStore( 2347 IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); 2348 } 2349 else 2350 NewStoreInstr = 2351 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); 2352 2353 Group->addMetadata(NewStoreInstr); 2354 } 2355 } 2356 2357 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, 2358 VPTransformState &State, 2359 VPValue *Addr, 2360 VPValue *StoredValue, 2361 VPValue *BlockInMask) { 2362 // Attempt to issue a wide load. 2363 LoadInst *LI = dyn_cast<LoadInst>(Instr); 2364 StoreInst *SI = dyn_cast<StoreInst>(Instr); 2365 2366 assert((LI || SI) && "Invalid Load/Store instruction"); 2367 assert((!SI || StoredValue) && "No stored value provided for widened store"); 2368 assert((!LI || !StoredValue) && "Stored value provided for widened load"); 2369 2370 LoopVectorizationCostModel::InstWidening Decision = 2371 Cost->getWideningDecision(Instr, VF); 2372 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 2373 "CM decision should be taken at this point"); 2374 if (Decision == LoopVectorizationCostModel::CM_Interleave) 2375 return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); 2376 2377 Type *ScalarDataTy = getMemInstValueType(Instr); 2378 Type *DataTy = VectorType::get(ScalarDataTy, VF); 2379 // An alignment of 0 means target abi alignment. We need to use the scalar's 2380 // target abi alignment in such a case. 2381 const DataLayout &DL = Instr->getModule()->getDataLayout(); 2382 const Align Alignment = 2383 DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); 2384 2385 // Determine if the pointer operand of the access is either consecutive or 2386 // reverse consecutive. 2387 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); 2388 bool ConsecutiveStride = 2389 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); 2390 bool CreateGatherScatter = 2391 (Decision == LoopVectorizationCostModel::CM_GatherScatter); 2392 2393 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector 2394 // gather/scatter. 
Otherwise Decision should have been to Scalarize. 2395 assert((ConsecutiveStride || CreateGatherScatter) && 2396 "The instruction should be scalarized"); 2397 (void)ConsecutiveStride; 2398 2399 VectorParts BlockInMaskParts(UF); 2400 bool isMaskRequired = BlockInMask; 2401 if (isMaskRequired) 2402 for (unsigned Part = 0; Part < UF; ++Part) 2403 BlockInMaskParts[Part] = State.get(BlockInMask, Part); 2404 2405 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { 2406 // Calculate the pointer for the specific unroll-part. 2407 GetElementPtrInst *PartPtr = nullptr; 2408 2409 bool InBounds = false; 2410 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) 2411 InBounds = gep->isInBounds(); 2412 2413 if (Reverse) { 2414 // If the address is consecutive but reversed, then the 2415 // wide store needs to start at the last vector element. 2416 PartPtr = cast<GetElementPtrInst>( 2417 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF))); 2418 PartPtr->setIsInBounds(InBounds); 2419 PartPtr = cast<GetElementPtrInst>( 2420 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); 2421 PartPtr->setIsInBounds(InBounds); 2422 if (isMaskRequired) // Reverse of a null all-one mask is a null mask. 2423 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); 2424 } else { 2425 PartPtr = cast<GetElementPtrInst>( 2426 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); 2427 PartPtr->setIsInBounds(InBounds); 2428 } 2429 2430 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); 2431 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); 2432 }; 2433 2434 // Handle Stores: 2435 if (SI) { 2436 setDebugLocFromInst(Builder, SI); 2437 2438 for (unsigned Part = 0; Part < UF; ++Part) { 2439 Instruction *NewSI = nullptr; 2440 Value *StoredVal = State.get(StoredValue, Part); 2441 if (CreateGatherScatter) { 2442 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; 2443 Value *VectorGep = State.get(Addr, Part); 2444 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, 2445 MaskPart); 2446 } else { 2447 if (Reverse) { 2448 // If we store to reverse consecutive memory locations, then we need 2449 // to reverse the order of elements in the stored value. 2450 StoredVal = reverseVector(StoredVal); 2451 // We don't want to update the value in the map as it might be used in 2452 // another expression. So don't call resetVectorValue(StoredVal). 2453 } 2454 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2455 if (isMaskRequired) 2456 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, 2457 BlockInMaskParts[Part]); 2458 else 2459 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); 2460 } 2461 addMetadata(NewSI, SI); 2462 } 2463 return; 2464 } 2465 2466 // Handle loads. 2467 assert(LI && "Must have a load instruction"); 2468 setDebugLocFromInst(Builder, LI); 2469 for (unsigned Part = 0; Part < UF; ++Part) { 2470 Value *NewLI; 2471 if (CreateGatherScatter) { 2472 Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; 2473 Value *VectorGep = State.get(Addr, Part); 2474 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, 2475 nullptr, "wide.masked.gather"); 2476 addMetadata(NewLI, LI); 2477 } else { 2478 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); 2479 if (isMaskRequired) 2480 NewLI = Builder.CreateMaskedLoad( 2481 VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy), 2482 "wide.masked.load"); 2483 else 2484 NewLI = 2485 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); 2486 2487 // Add metadata to the load, but setVectorValue to the reverse shuffle. 2488 addMetadata(NewLI, LI); 2489 if (Reverse) 2490 NewLI = reverseVector(NewLI); 2491 } 2492 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI); 2493 } 2494 } 2495 2496 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, 2497 const VPIteration &Instance, 2498 bool IfPredicateInstr) { 2499 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); 2500 2501 setDebugLocFromInst(Builder, Instr); 2502 2503 // Does this instruction return a value ? 2504 bool IsVoidRetTy = Instr->getType()->isVoidTy(); 2505 2506 Instruction *Cloned = Instr->clone(); 2507 if (!IsVoidRetTy) 2508 Cloned->setName(Instr->getName() + ".cloned"); 2509 2510 // Replace the operands of the cloned instructions with their scalar 2511 // equivalents in the new loop. 2512 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { 2513 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); 2514 Cloned->setOperand(op, NewOp); 2515 } 2516 addNewMetadata(Cloned, Instr); 2517 2518 // Place the cloned scalar in the new loop. 2519 Builder.Insert(Cloned); 2520 2521 // Add the cloned scalar to the scalar map entry. 2522 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); 2523 2524 // If we just cloned a new assumption, add it the assumption cache. 2525 if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) 2526 if (II->getIntrinsicID() == Intrinsic::assume) 2527 AC->registerAssumption(II); 2528 2529 // End if-block. 2530 if (IfPredicateInstr) 2531 PredicatedInstructions.push_back(Cloned); 2532 } 2533 2534 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, 2535 Value *End, Value *Step, 2536 Instruction *DL) { 2537 BasicBlock *Header = L->getHeader(); 2538 BasicBlock *Latch = L->getLoopLatch(); 2539 // As we're just creating this loop, it's possible no latch exists 2540 // yet. If so, use the header as this will be a single block loop. 2541 if (!Latch) 2542 Latch = Header; 2543 2544 IRBuilder<> Builder(&*Header->getFirstInsertionPt()); 2545 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction); 2546 setDebugLocFromInst(Builder, OldInst); 2547 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); 2548 2549 Builder.SetInsertPoint(Latch->getTerminator()); 2550 setDebugLocFromInst(Builder, OldInst); 2551 2552 // Create i+1 and fill the PHINode. 2553 Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); 2554 Induction->addIncoming(Start, L->getLoopPreheader()); 2555 Induction->addIncoming(Next, Latch); 2556 // Create the compare. 2557 Value *ICmp = Builder.CreateICmpEQ(Next, End); 2558 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); 2559 2560 // Now we have two terminators. Remove the old one from the block. 
2561 Latch->getTerminator()->eraseFromParent(); 2562 2563 return Induction; 2564 } 2565 2566 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { 2567 if (TripCount) 2568 return TripCount; 2569 2570 assert(L && "Create Trip Count for null loop."); 2571 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2572 // Find the loop boundaries. 2573 ScalarEvolution *SE = PSE.getSE(); 2574 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); 2575 assert(BackedgeTakenCount != SE->getCouldNotCompute() && 2576 "Invalid loop count"); 2577 2578 Type *IdxTy = Legal->getWidestInductionType(); 2579 assert(IdxTy && "No type for induction"); 2580 2581 // The exit count might have the type of i64 while the phi is i32. This can 2582 // happen if we have an induction variable that is sign extended before the 2583 // compare. The only way that we get a backedge taken count is that the 2584 // induction variable was signed and as such will not overflow. In such a case 2585 // truncation is legal. 2586 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > 2587 IdxTy->getPrimitiveSizeInBits()) 2588 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); 2589 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); 2590 2591 // Get the total trip count from the count by adding 1. 2592 const SCEV *ExitCount = SE->getAddExpr( 2593 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); 2594 2595 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); 2596 2597 // Expand the trip count and place the new instructions in the preheader. 2598 // Notice that the pre-header does not change, only the loop body. 2599 SCEVExpander Exp(*SE, DL, "induction"); 2600 2601 // Count holds the overall loop count (N). 2602 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), 2603 L->getLoopPreheader()->getTerminator()); 2604 2605 if (TripCount->getType()->isPointerTy()) 2606 TripCount = 2607 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int", 2608 L->getLoopPreheader()->getTerminator()); 2609 2610 return TripCount; 2611 } 2612 2613 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { 2614 if (VectorTripCount) 2615 return VectorTripCount; 2616 2617 Value *TC = getOrCreateTripCount(L); 2618 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); 2619 2620 Type *Ty = TC->getType(); 2621 Constant *Step = ConstantInt::get(Ty, VF * UF); 2622 2623 // If the tail is to be folded by masking, round the number of iterations N 2624 // up to a multiple of Step instead of rounding down. This is done by first 2625 // adding Step-1 and then rounding down. Note that it's ok if this addition 2626 // overflows: the vector induction variable will eventually wrap to zero given 2627 // that it starts at zero and its Step is a power of two; the loop will then 2628 // exit, with the last early-exit vector comparison also producing all-true. 2629 if (Cost->foldTailByMasking()) { 2630 assert(isPowerOf2_32(VF * UF) && 2631 "VF*UF must be a power of 2 when folding tail by masking"); 2632 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up"); 2633 } 2634 2635 // Now we need to generate the expression for the part of the loop that the 2636 // vectorized body will execute. This is equal to N - (N % Step) if scalar 2637 // iterations are not required for correctness, or N - Step, otherwise. 
Step 2638 // is equal to the vectorization factor (number of SIMD elements) times the 2639 // unroll factor (number of SIMD instructions). 2640 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); 2641 2642 // If there is a non-reversed interleaved group that may speculatively access 2643 // memory out-of-bounds, we need to ensure that there will be at least one 2644 // iteration of the scalar epilogue loop. Thus, if the step evenly divides 2645 // the trip count, we set the remainder to be equal to the step. If the step 2646 // does not evenly divide the trip count, no adjustment is necessary since 2647 // there will already be scalar iterations. Note that the minimum iterations 2648 // check ensures that N >= Step. 2649 if (VF > 1 && Cost->requiresScalarEpilogue()) { 2650 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); 2651 R = Builder.CreateSelect(IsZero, Step, R); 2652 } 2653 2654 VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); 2655 2656 return VectorTripCount; 2657 } 2658 2659 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, 2660 const DataLayout &DL) { 2661 // Verify that V is a vector type with same number of elements as DstVTy. 2662 unsigned VF = DstVTy->getNumElements(); 2663 VectorType *SrcVecTy = cast<VectorType>(V->getType()); 2664 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"); 2665 Type *SrcElemTy = SrcVecTy->getElementType(); 2666 Type *DstElemTy = DstVTy->getElementType(); 2667 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && 2668 "Vector elements must have same size"); 2669 2670 // Do a direct cast if element types are castable. 2671 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { 2672 return Builder.CreateBitOrPointerCast(V, DstVTy); 2673 } 2674 // V cannot be directly casted to desired vector type. 2675 // May happen when V is a floating point vector but DstVTy is a vector of 2676 // pointers or vice-versa. Handle this using a two-step bitcast using an 2677 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. 2678 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && 2679 "Only one type should be a pointer type"); 2680 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && 2681 "Only one type should be a floating point type"); 2682 Type *IntTy = 2683 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); 2684 VectorType *VecIntTy = VectorType::get(IntTy, VF); 2685 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); 2686 return Builder.CreateBitOrPointerCast(CastVal, DstVTy); 2687 } 2688 2689 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, 2690 BasicBlock *Bypass) { 2691 Value *Count = getOrCreateTripCount(L); 2692 // Reuse existing vector loop preheader for TC checks. 2693 // Note that new preheader block is generated for vector loop. 2694 BasicBlock *const TCCheckBlock = LoopVectorPreHeader; 2695 IRBuilder<> Builder(TCCheckBlock->getTerminator()); 2696 2697 // Generate code to check if the loop's trip count is less than VF * UF, or 2698 // equal to it in case a scalar epilogue is required; this implies that the 2699 // vector trip count is zero. This check also covers the case where adding one 2700 // to the backedge-taken count overflowed leading to an incorrect trip count 2701 // of zero. In this case we will also jump to the scalar loop. 2702 auto P = Cost->requiresScalarEpilogue() ? 
ICmpInst::ICMP_ULE 2703 : ICmpInst::ICMP_ULT; 2704 2705 // If tail is to be folded, vector loop takes care of all iterations. 2706 Value *CheckMinIters = Builder.getFalse(); 2707 if (!Cost->foldTailByMasking()) 2708 CheckMinIters = Builder.CreateICmp( 2709 P, Count, ConstantInt::get(Count->getType(), VF * UF), 2710 "min.iters.check"); 2711 2712 // Create new preheader for vector loop. 2713 LoopVectorPreHeader = 2714 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, 2715 "vector.ph"); 2716 2717 assert(DT->properlyDominates(DT->getNode(TCCheckBlock), 2718 DT->getNode(Bypass)->getIDom()) && 2719 "TC check is expected to dominate Bypass"); 2720 2721 // Update dominator for Bypass & LoopExit. 2722 DT->changeImmediateDominator(Bypass, TCCheckBlock); 2723 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); 2724 2725 ReplaceInstWithInst( 2726 TCCheckBlock->getTerminator(), 2727 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); 2728 LoopBypassBlocks.push_back(TCCheckBlock); 2729 } 2730 2731 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { 2732 // Reuse existing vector loop preheader for SCEV checks. 2733 // Note that new preheader block is generated for vector loop. 2734 BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; 2735 2736 // Generate the code to check that the SCEV assumptions that we made. 2737 // We want the new basic block to start at the first instruction in a 2738 // sequence of instructions that form a check. 2739 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), 2740 "scev.check"); 2741 Value *SCEVCheck = Exp.expandCodeForPredicate( 2742 &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); 2743 2744 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) 2745 if (C->isZero()) 2746 return; 2747 2748 assert(!SCEVCheckBlock->getParent()->hasOptSize() && 2749 "Cannot SCEV check stride or overflow when optimizing for size"); 2750 2751 SCEVCheckBlock->setName("vector.scevcheck"); 2752 // Create new preheader for vector loop. 2753 LoopVectorPreHeader = 2754 SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, 2755 nullptr, "vector.ph"); 2756 2757 // Update dominator only if this is first RT check. 2758 if (LoopBypassBlocks.empty()) { 2759 DT->changeImmediateDominator(Bypass, SCEVCheckBlock); 2760 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); 2761 } 2762 2763 ReplaceInstWithInst( 2764 SCEVCheckBlock->getTerminator(), 2765 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); 2766 LoopBypassBlocks.push_back(SCEVCheckBlock); 2767 AddedSafetyChecks = true; 2768 } 2769 2770 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { 2771 // VPlan-native path does not do any analysis for runtime checks currently. 2772 if (EnableVPlanNativePath) 2773 return; 2774 2775 // Reuse existing vector loop preheader for runtime memory checks. 2776 // Note that new preheader block is generated for vector loop. 2777 BasicBlock *const MemCheckBlock = L->getLoopPreheader(); 2778 2779 // Generate the code that checks in runtime if arrays overlap. We put the 2780 // checks into a separate block to make the more common case of few elements 2781 // faster. 
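// Roughly (illustrative sketch, not the exact IR LAI emits): for two accessed
// ranges [A, A+S) and [B, B+S) the generated check computes something like
//   %found.conflict = and (icmp ult %A.start, %B.end), (icmp ult %B.start, %A.end)
// and the per-pair results are or'ed together; if the combined check is true
// we take the bypass edge to the scalar loop instead of the vector loop.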
2782 Instruction *FirstCheckInst; 2783 Instruction *MemRuntimeCheck; 2784 std::tie(FirstCheckInst, MemRuntimeCheck) = 2785 Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); 2786 if (!MemRuntimeCheck) 2787 return; 2788 2789 if (MemCheckBlock->getParent()->hasOptSize()) { 2790 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && 2791 "Cannot emit memory checks when optimizing for size, unless forced " 2792 "to vectorize."); 2793 ORE->emit([&]() { 2794 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", 2795 L->getStartLoc(), L->getHeader()) 2796 << "Code-size may be reduced by not forcing " 2797 "vectorization, or by source-code modifications " 2798 "eliminating the need for runtime checks " 2799 "(e.g., adding 'restrict')."; 2800 }); 2801 } 2802 2803 MemCheckBlock->setName("vector.memcheck"); 2804 // Create new preheader for vector loop. 2805 LoopVectorPreHeader = 2806 SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, 2807 "vector.ph"); 2808 2809 // Update dominator only if this is first RT check. 2810 if (LoopBypassBlocks.empty()) { 2811 DT->changeImmediateDominator(Bypass, MemCheckBlock); 2812 DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); 2813 } 2814 2815 ReplaceInstWithInst( 2816 MemCheckBlock->getTerminator(), 2817 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); 2818 LoopBypassBlocks.push_back(MemCheckBlock); 2819 AddedSafetyChecks = true; 2820 2821 // We currently don't use LoopVersioning for the actual loop cloning but we 2822 // still use it to add the noalias metadata. 2823 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, 2824 PSE.getSE()); 2825 LVer->prepareNoAliasMetadata(); 2826 } 2827 2828 Value *InnerLoopVectorizer::emitTransformedIndex( 2829 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, 2830 const InductionDescriptor &ID) const { 2831 2832 SCEVExpander Exp(*SE, DL, "induction"); 2833 auto Step = ID.getStep(); 2834 auto StartValue = ID.getStartValue(); 2835 assert(Index->getType() == Step->getType() && 2836 "Index type does not match StepValue type"); 2837 2838 // Note: the IR at this point is broken. We cannot use SE to create any new 2839 // SCEV and then expand it, hoping that SCEV's simplification will give us 2840 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2841 // lead to various SCEV crashes. So all we can do is to use builder and rely 2842 // on InstCombine for future simplifications. Here we handle some trivial 2843 // cases only. 
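// For example (illustrative): an integer induction with start S and step C
// maps an index i to S + i * C in the code below; the CreateAdd/CreateMul
// helpers only fold the trivial S == 0 and C == 1 cases and leave any further
// simplification to InstCombine.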
2844 auto CreateAdd = [&B](Value *X, Value *Y) { 2845 assert(X->getType() == Y->getType() && "Types don't match!"); 2846 if (auto *CX = dyn_cast<ConstantInt>(X)) 2847 if (CX->isZero()) 2848 return Y; 2849 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2850 if (CY->isZero()) 2851 return X; 2852 return B.CreateAdd(X, Y); 2853 }; 2854 2855 auto CreateMul = [&B](Value *X, Value *Y) { 2856 assert(X->getType() == Y->getType() && "Types don't match!"); 2857 if (auto *CX = dyn_cast<ConstantInt>(X)) 2858 if (CX->isOne()) 2859 return Y; 2860 if (auto *CY = dyn_cast<ConstantInt>(Y)) 2861 if (CY->isOne()) 2862 return X; 2863 return B.CreateMul(X, Y); 2864 }; 2865 2866 switch (ID.getKind()) { 2867 case InductionDescriptor::IK_IntInduction: { 2868 assert(Index->getType() == StartValue->getType() && 2869 "Index type does not match StartValue type"); 2870 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) 2871 return B.CreateSub(StartValue, Index); 2872 auto *Offset = CreateMul( 2873 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint())); 2874 return CreateAdd(StartValue, Offset); 2875 } 2876 case InductionDescriptor::IK_PtrInduction: { 2877 assert(isa<SCEVConstant>(Step) && 2878 "Expected constant step for pointer induction"); 2879 return B.CreateGEP( 2880 StartValue->getType()->getPointerElementType(), StartValue, 2881 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(), 2882 &*B.GetInsertPoint()))); 2883 } 2884 case InductionDescriptor::IK_FpInduction: { 2885 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); 2886 auto InductionBinOp = ID.getInductionBinOp(); 2887 assert(InductionBinOp && 2888 (InductionBinOp->getOpcode() == Instruction::FAdd || 2889 InductionBinOp->getOpcode() == Instruction::FSub) && 2890 "Original bin op should be defined for FP induction"); 2891 2892 Value *StepValue = cast<SCEVUnknown>(Step)->getValue(); 2893 2894 // Floating point operations had to be 'fast' to enable the induction. 2895 FastMathFlags Flags; 2896 Flags.setFast(); 2897 2898 Value *MulExp = B.CreateFMul(StepValue, Index); 2899 if (isa<Instruction>(MulExp)) 2900 // We have to check, the MulExp may be a constant. 2901 cast<Instruction>(MulExp)->setFastMathFlags(Flags); 2902 2903 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, 2904 "induction"); 2905 if (isa<Instruction>(BOp)) 2906 cast<Instruction>(BOp)->setFastMathFlags(Flags); 2907 2908 return BOp; 2909 } 2910 case InductionDescriptor::IK_NoInduction: 2911 return nullptr; 2912 } 2913 llvm_unreachable("invalid enum"); 2914 } 2915 2916 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { 2917 /* 2918 In this function we generate a new loop. The new loop will contain 2919 the vectorized instructions while the old loop will continue to run the 2920 scalar remainder. 2921 2922 [ ] <-- loop iteration number check. 2923 / | 2924 / v 2925 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2926 | / | 2927 | / v 2928 || [ ] <-- vector pre header. 2929 |/ | 2930 | v 2931 | [ ] \ 2932 | [ ]_| <-- vector loop. 2933 | | 2934 | v 2935 | -[ ] <--- middle-block. 2936 | / | 2937 | / v 2938 -|- >[ ] <--- new preheader. 2939 | | 2940 | v 2941 | [ ] \ 2942 | [ ]_| <-- old scalar loop to handle remainder. 2943 \ | 2944 \ v 2945 >[ ] <-- exit block. 2946 ... 2947 */ 2948 2949 MDNode *OrigLoopID = OrigLoop->getLoopID(); 2950 2951 // Some loops have a single integer induction variable, while other loops 2952 // don't. 
One example is c++ iterators that often have multiple pointer 2953 // induction variables. In the code below we also support a case where we 2954 // don't have a single induction variable. 2955 // 2956 // We try to obtain an induction variable from the original loop as hard 2957 // as possible. However if we don't find one that: 2958 // - is an integer 2959 // - counts from zero, stepping by one 2960 // - is the size of the widest induction variable type 2961 // then we create a new one. 2962 OldInduction = Legal->getPrimaryInduction(); 2963 Type *IdxTy = Legal->getWidestInductionType(); 2964 2965 // Split the single block loop into the two loop structure described above. 2966 LoopScalarBody = OrigLoop->getHeader(); 2967 LoopVectorPreHeader = OrigLoop->getLoopPreheader(); 2968 LoopExitBlock = OrigLoop->getExitBlock(); 2969 assert(LoopExitBlock && "Must have an exit block"); 2970 assert(LoopVectorPreHeader && "Invalid loop structure"); 2971 2972 LoopMiddleBlock = 2973 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2974 LI, nullptr, "middle.block"); 2975 LoopScalarPreHeader = 2976 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, 2977 nullptr, "scalar.ph"); 2978 // We intentionally don't let SplitBlock to update LoopInfo since 2979 // LoopVectorBody should belong to another loop than LoopVectorPreHeader. 2980 // LoopVectorBody is explicitly added to the correct place few lines later. 2981 LoopVectorBody = 2982 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, 2983 nullptr, nullptr, "vector.body"); 2984 2985 // Update dominator for loop exit. 2986 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); 2987 2988 // Create and register the new vector loop. 2989 Loop *Lp = LI->AllocateLoop(); 2990 Loop *ParentLoop = OrigLoop->getParentLoop(); 2991 2992 // Insert the new loop into the loop nest and register the new basic blocks 2993 // before calling any utilities such as SCEV that require valid LoopInfo. 2994 if (ParentLoop) { 2995 ParentLoop->addChildLoop(Lp); 2996 } else { 2997 LI->addTopLevelLoop(Lp); 2998 } 2999 Lp->addBasicBlockToLoop(LoopVectorBody, *LI); 3000 3001 // Find the loop boundaries. 3002 Value *Count = getOrCreateTripCount(Lp); 3003 3004 Value *StartIdx = ConstantInt::get(IdxTy, 0); 3005 3006 // Now, compare the new count to zero. If it is zero skip the vector loop and 3007 // jump to the scalar loop. This check also covers the case where the 3008 // backedge-taken count is uint##_max: adding one to it will overflow leading 3009 // to an incorrect trip count of zero. In this (rare) case we will also jump 3010 // to the scalar loop. 3011 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); 3012 3013 // Generate the code to check any assumptions that we've made for SCEV 3014 // expressions. 3015 emitSCEVChecks(Lp, LoopScalarPreHeader); 3016 3017 // Generate the code that checks in runtime if arrays overlap. We put the 3018 // checks into a separate block to make the more common case of few elements 3019 // faster. 3020 emitMemRuntimeChecks(Lp, LoopScalarPreHeader); 3021 3022 // Generate the induction variable. 3023 // The loop step is equal to the vectorization factor (num of SIMD elements) 3024 // times the unroll factor (num of SIMD instructions). 
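// For example (illustrative): with VF = 4 and UF = 2 the step is 8, so the
// canonical induction created below runs 0, 8, 16, ... and exits once it
// reaches the vector trip count (CountRoundDown) obtained just below.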
3025 Value *CountRoundDown = getOrCreateVectorTripCount(Lp); 3026 Constant *Step = ConstantInt::get(IdxTy, VF * UF); 3027 Induction = 3028 createInductionVariable(Lp, StartIdx, CountRoundDown, Step, 3029 getDebugLocFromInstOrOperands(OldInduction)); 3030 3031 // We are going to resume the execution of the scalar loop. 3032 // Go over all of the induction variables that we found and fix the 3033 // PHIs that are left in the scalar version of the loop. 3034 // The starting values of PHI nodes depend on the counter of the last 3035 // iteration in the vectorized loop. 3036 // If we come from a bypass edge then we need to start from the original 3037 // start value. 3038 3039 // This variable saves the new starting index for the scalar loop. It is used 3040 // to test if there are any tail iterations left once the vector loop has 3041 // completed. 3042 for (auto &InductionEntry : Legal->getInductionVars()) { 3043 PHINode *OrigPhi = InductionEntry.first; 3044 InductionDescriptor II = InductionEntry.second; 3045 3046 // Create phi nodes to merge from the backedge-taken check block. 3047 PHINode *BCResumeVal = 3048 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", 3049 LoopScalarPreHeader->getTerminator()); 3050 // Copy original phi DL over to the new one. 3051 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); 3052 Value *&EndValue = IVEndValues[OrigPhi]; 3053 if (OrigPhi == OldInduction) { 3054 // We know what the end value is. 3055 EndValue = CountRoundDown; 3056 } else { 3057 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); 3058 Type *StepType = II.getStep()->getType(); 3059 Instruction::CastOps CastOp = 3060 CastInst::getCastOpcode(CountRoundDown, true, StepType, true); 3061 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); 3062 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); 3063 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); 3064 EndValue->setName("ind.end"); 3065 } 3066 3067 // The new PHI merges the original incoming value, in case of a bypass, 3068 // or the value at the end of the vectorized loop. 3069 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); 3070 3071 // Fix the scalar body counter (PHI node). 3072 // The old induction's phi node in the scalar body needs the truncated 3073 // value. 3074 for (BasicBlock *BB : LoopBypassBlocks) 3075 BCResumeVal->addIncoming(II.getStartValue(), BB); 3076 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); 3077 } 3078 3079 // We need the OrigLoop (scalar loop part) latch terminator to help 3080 // produce correct debug info for the middle block BB instructions. 3081 // The legality check stage guarantees that the loop will have a single 3082 // latch. 3083 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && 3084 "Scalar loop latch terminator isn't a branch"); 3085 BranchInst *ScalarLatchBr = 3086 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()); 3087 3088 // Add a check in the middle block to see if we have completed 3089 // all of the iterations in the first vector loop. 3090 // If (N - N%VF) == N, then we *don't* need to run the remainder. 3091 // If tail is to be folded, we know we don't need to run the remainder. 
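// For example (illustrative), with trip count N = 10 and VF * UF = 4 (and no
// scalar epilogue forced by an interleave group): CountRoundDown is 8, CmpN is
// false, and iterations 8 and 9 run in the scalar remainder loop; with N = 12,
// CmpN is true and the remainder loop is skipped entirely.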
3092 Value *CmpN = Builder.getTrue(); 3093 if (!Cost->foldTailByMasking()) { 3094 CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, 3095 CountRoundDown, "cmp.n", 3096 LoopMiddleBlock->getTerminator()); 3097 3098 // Here we use the same DebugLoc as the scalar loop latch branch instead 3099 // of the corresponding compare because they may have ended up with 3100 // different line numbers and we want to avoid awkward line stepping while 3101 // debugging. Eg. if the compare has got a line number inside the loop. 3102 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3103 } 3104 3105 BranchInst *BrInst = 3106 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); 3107 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); 3108 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); 3109 3110 // Get ready to start creating new instructions into the vectorized body. 3111 assert(LoopVectorPreHeader == Lp->getLoopPreheader() && 3112 "Inconsistent vector loop preheader"); 3113 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); 3114 3115 Optional<MDNode *> VectorizedLoopID = 3116 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 3117 LLVMLoopVectorizeFollowupVectorized}); 3118 if (VectorizedLoopID.hasValue()) { 3119 Lp->setLoopID(VectorizedLoopID.getValue()); 3120 3121 // Do not setAlreadyVectorized if loop attributes have been defined 3122 // explicitly. 3123 return LoopVectorPreHeader; 3124 } 3125 3126 // Keep all loop hints from the original loop on the vector loop (we'll 3127 // replace the vectorizer-specific hints below). 3128 if (MDNode *LID = OrigLoop->getLoopID()) 3129 Lp->setLoopID(LID); 3130 3131 LoopVectorizeHints Hints(Lp, true, *ORE); 3132 Hints.setAlreadyVectorized(); 3133 3134 #ifdef EXPENSIVE_CHECKS 3135 assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 3136 LI->verify(*DT); 3137 #endif 3138 3139 return LoopVectorPreHeader; 3140 } 3141 3142 // Fix up external users of the induction variable. At this point, we are 3143 // in LCSSA form, with all external PHIs that use the IV having one input value, 3144 // coming from the remainder loop. We need those PHIs to also have a correct 3145 // value for the IV when arriving directly from the middle block. 3146 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, 3147 const InductionDescriptor &II, 3148 Value *CountRoundDown, Value *EndValue, 3149 BasicBlock *MiddleBlock) { 3150 // There are two kinds of external IV usages - those that use the value 3151 // computed in the last iteration (the PHI) and those that use the penultimate 3152 // value (the value that feeds into the phi from the loop latch). 3153 // We allow both, but they, obviously, have different values. 3154 3155 assert(OrigLoop->getExitBlock() && "Expected a single exit block"); 3156 3157 DenseMap<Value *, Value *> MissingVals; 3158 3159 // An external user of the last iteration's value should see the value that 3160 // the remainder loop uses to initialize its own IV. 3161 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); 3162 for (User *U : PostInc->users()) { 3163 Instruction *UI = cast<Instruction>(U); 3164 if (!OrigLoop->contains(UI)) { 3165 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3166 MissingVals[UI] = EndValue; 3167 } 3168 } 3169 3170 // An external user of the penultimate value need to see EndValue - Step. 3171 // The simplest way to get this is to recompute it from the constituent SCEVs, 3172 // that is Start + (Step * (CRD - 1)). 
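// For example (illustrative): for an IV starting at 0 with step 1 and a
// vector trip count CRD = 8, an outside user of the latch value sees 8, while
// an outside user of the phi itself sees 7 = 0 + 1 * (8 - 1).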
3173 for (User *U : OrigPhi->users()) { 3174 auto *UI = cast<Instruction>(U); 3175 if (!OrigLoop->contains(UI)) { 3176 const DataLayout &DL = 3177 OrigLoop->getHeader()->getModule()->getDataLayout(); 3178 assert(isa<PHINode>(UI) && "Expected LCSSA form"); 3179 3180 IRBuilder<> B(MiddleBlock->getTerminator()); 3181 Value *CountMinusOne = B.CreateSub( 3182 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); 3183 Value *CMO = 3184 !II.getStep()->getType()->isIntegerTy() 3185 ? B.CreateCast(Instruction::SIToFP, CountMinusOne, 3186 II.getStep()->getType()) 3187 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); 3188 CMO->setName("cast.cmo"); 3189 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); 3190 Escape->setName("ind.escape"); 3191 MissingVals[UI] = Escape; 3192 } 3193 } 3194 3195 for (auto &I : MissingVals) { 3196 PHINode *PHI = cast<PHINode>(I.first); 3197 // One corner case we have to handle is two IVs "chasing" each-other, 3198 // that is %IV2 = phi [...], [ %IV1, %latch ] 3199 // In this case, if IV1 has an external use, we need to avoid adding both 3200 // "last value of IV1" and "penultimate value of IV2". So, verify that we 3201 // don't already have an incoming value for the middle block. 3202 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) 3203 PHI->addIncoming(I.second, MiddleBlock); 3204 } 3205 } 3206 3207 namespace { 3208 3209 struct CSEDenseMapInfo { 3210 static bool canHandle(const Instruction *I) { 3211 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || 3212 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I); 3213 } 3214 3215 static inline Instruction *getEmptyKey() { 3216 return DenseMapInfo<Instruction *>::getEmptyKey(); 3217 } 3218 3219 static inline Instruction *getTombstoneKey() { 3220 return DenseMapInfo<Instruction *>::getTombstoneKey(); 3221 } 3222 3223 static unsigned getHashValue(const Instruction *I) { 3224 assert(canHandle(I) && "Unknown instruction!"); 3225 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(), 3226 I->value_op_end())); 3227 } 3228 3229 static bool isEqual(const Instruction *LHS, const Instruction *RHS) { 3230 if (LHS == getEmptyKey() || RHS == getEmptyKey() || 3231 LHS == getTombstoneKey() || RHS == getTombstoneKey()) 3232 return LHS == RHS; 3233 return LHS->isIdenticalTo(RHS); 3234 } 3235 }; 3236 3237 } // end anonymous namespace 3238 3239 ///Perform cse of induction variable instructions. 3240 static void cse(BasicBlock *BB) { 3241 // Perform simple cse. 3242 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; 3243 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { 3244 Instruction *In = &*I++; 3245 3246 if (!CSEDenseMapInfo::canHandle(In)) 3247 continue; 3248 3249 // Check if we can replace this instruction with any of the 3250 // visited instructions. 3251 if (Instruction *V = CSEMap.lookup(In)) { 3252 In->replaceAllUsesWith(V); 3253 In->eraseFromParent(); 3254 continue; 3255 } 3256 3257 CSEMap[In] = In; 3258 } 3259 } 3260 3261 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, 3262 unsigned VF, 3263 bool &NeedToScalarize) { 3264 Function *F = CI->getCalledFunction(); 3265 Type *ScalarRetTy = CI->getType(); 3266 SmallVector<Type *, 4> Tys, ScalarTys; 3267 for (auto &ArgOp : CI->arg_operands()) 3268 ScalarTys.push_back(ArgOp->getType()); 3269 3270 // Estimate cost of scalarized vector call. 
The source operands are assumed 3271 // to be vectors, so we need to extract individual elements from there, 3272 // execute VF scalar calls, and then gather the result into the vector return 3273 // value. 3274 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys); 3275 if (VF == 1) 3276 return ScalarCallCost; 3277 3278 // Compute corresponding vector type for return value and arguments. 3279 Type *RetTy = ToVectorTy(ScalarRetTy, VF); 3280 for (Type *ScalarTy : ScalarTys) 3281 Tys.push_back(ToVectorTy(ScalarTy, VF)); 3282 3283 // Compute costs of unpacking argument values for the scalar calls and 3284 // packing the return values to a vector. 3285 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); 3286 3287 unsigned Cost = ScalarCallCost * VF + ScalarizationCost; 3288 3289 // If we can't emit a vector call for this function, then the currently found 3290 // cost is the cost we need to return. 3291 NeedToScalarize = true; 3292 VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/); 3293 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); 3294 3295 if (!TLI || CI->isNoBuiltin() || !VecFunc) 3296 return Cost; 3297 3298 // If the corresponding vector cost is cheaper, return its cost. 3299 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys); 3300 if (VectorCallCost < Cost) { 3301 NeedToScalarize = false; 3302 return VectorCallCost; 3303 } 3304 return Cost; 3305 } 3306 3307 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, 3308 unsigned VF) { 3309 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 3310 assert(ID && "Expected intrinsic call!"); 3311 3312 FastMathFlags FMF; 3313 if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) 3314 FMF = FPMO->getFastMathFlags(); 3315 3316 SmallVector<Value *, 4> Operands(CI->arg_operands()); 3317 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI); 3318 } 3319 3320 static Type *smallestIntegerVectorType(Type *T1, Type *T2) { 3321 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3322 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3323 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; 3324 } 3325 static Type *largestIntegerVectorType(Type *T1, Type *T2) { 3326 auto *I1 = cast<IntegerType>(T1->getVectorElementType()); 3327 auto *I2 = cast<IntegerType>(T2->getVectorElementType()); 3328 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; 3329 } 3330 3331 void InnerLoopVectorizer::truncateToMinimalBitwidths() { 3332 // For every instruction `I` in MinBWs, truncate the operands, create a 3333 // truncated version of `I` and reextend its result. InstCombine runs 3334 // later and will remove any ext/trunc pairs. 3335 SmallPtrSet<Value *, 4> Erased; 3336 for (const auto &KV : Cost->getMinimalBitwidths()) { 3337 // If the value wasn't vectorized, we must maintain the original scalar 3338 // type. The absence of the value from VectorLoopValueMap indicates that it 3339 // wasn't vectorized. 
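// (Illustrative sketch of the rewrite done below: if MinBWs records that an
// i32 'add' only needs 8 bits, its vector form is rewritten roughly as
//   zext (add (trunc %a to <VF x i8>), (trunc %b to <VF x i8>)) to <VF x i32>
// and InstCombine is expected to remove the redundant ext/trunc pairs.)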
3340 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3341 continue; 3342 for (unsigned Part = 0; Part < UF; ++Part) { 3343 Value *I = getOrCreateVectorValue(KV.first, Part); 3344 if (Erased.find(I) != Erased.end() || I->use_empty() || 3345 !isa<Instruction>(I)) 3346 continue; 3347 Type *OriginalTy = I->getType(); 3348 Type *ScalarTruncatedTy = 3349 IntegerType::get(OriginalTy->getContext(), KV.second); 3350 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, 3351 OriginalTy->getVectorNumElements()); 3352 if (TruncatedTy == OriginalTy) 3353 continue; 3354 3355 IRBuilder<> B(cast<Instruction>(I)); 3356 auto ShrinkOperand = [&](Value *V) -> Value * { 3357 if (auto *ZI = dyn_cast<ZExtInst>(V)) 3358 if (ZI->getSrcTy() == TruncatedTy) 3359 return ZI->getOperand(0); 3360 return B.CreateZExtOrTrunc(V, TruncatedTy); 3361 }; 3362 3363 // The actual instruction modification depends on the instruction type, 3364 // unfortunately. 3365 Value *NewI = nullptr; 3366 if (auto *BO = dyn_cast<BinaryOperator>(I)) { 3367 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)), 3368 ShrinkOperand(BO->getOperand(1))); 3369 3370 // Any wrapping introduced by shrinking this operation shouldn't be 3371 // considered undefined behavior. So, we can't unconditionally copy 3372 // arithmetic wrapping flags to NewI. 3373 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false); 3374 } else if (auto *CI = dyn_cast<ICmpInst>(I)) { 3375 NewI = 3376 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)), 3377 ShrinkOperand(CI->getOperand(1))); 3378 } else if (auto *SI = dyn_cast<SelectInst>(I)) { 3379 NewI = B.CreateSelect(SI->getCondition(), 3380 ShrinkOperand(SI->getTrueValue()), 3381 ShrinkOperand(SI->getFalseValue())); 3382 } else if (auto *CI = dyn_cast<CastInst>(I)) { 3383 switch (CI->getOpcode()) { 3384 default: 3385 llvm_unreachable("Unhandled cast!"); 3386 case Instruction::Trunc: 3387 NewI = ShrinkOperand(CI->getOperand(0)); 3388 break; 3389 case Instruction::SExt: 3390 NewI = B.CreateSExtOrTrunc( 3391 CI->getOperand(0), 3392 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3393 break; 3394 case Instruction::ZExt: 3395 NewI = B.CreateZExtOrTrunc( 3396 CI->getOperand(0), 3397 smallestIntegerVectorType(OriginalTy, TruncatedTy)); 3398 break; 3399 } 3400 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) { 3401 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); 3402 auto *O0 = B.CreateZExtOrTrunc( 3403 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0)); 3404 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); 3405 auto *O1 = B.CreateZExtOrTrunc( 3406 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1)); 3407 3408 NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); 3409 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) { 3410 // Don't do anything with the operands, just extend the result. 
3411 continue; 3412 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) { 3413 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements(); 3414 auto *O0 = B.CreateZExtOrTrunc( 3415 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3416 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy); 3417 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2)); 3418 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { 3419 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements(); 3420 auto *O0 = B.CreateZExtOrTrunc( 3421 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements)); 3422 NewI = B.CreateExtractElement(O0, EE->getOperand(2)); 3423 } else { 3424 // If we don't know what to do, be conservative and don't do anything. 3425 continue; 3426 } 3427 3428 // Lastly, extend the result. 3429 NewI->takeName(cast<Instruction>(I)); 3430 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); 3431 I->replaceAllUsesWith(Res); 3432 cast<Instruction>(I)->eraseFromParent(); 3433 Erased.insert(I); 3434 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res); 3435 } 3436 } 3437 3438 // We'll have created a bunch of ZExts that are now parentless. Clean up. 3439 for (const auto &KV : Cost->getMinimalBitwidths()) { 3440 // If the value wasn't vectorized, we must maintain the original scalar 3441 // type. The absence of the value from VectorLoopValueMap indicates that it 3442 // wasn't vectorized. 3443 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first)) 3444 continue; 3445 for (unsigned Part = 0; Part < UF; ++Part) { 3446 Value *I = getOrCreateVectorValue(KV.first, Part); 3447 ZExtInst *Inst = dyn_cast<ZExtInst>(I); 3448 if (Inst && Inst->use_empty()) { 3449 Value *NewI = Inst->getOperand(0); 3450 Inst->eraseFromParent(); 3451 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI); 3452 } 3453 } 3454 } 3455 } 3456 3457 void InnerLoopVectorizer::fixVectorizedLoop() { 3458 // Insert truncates and extends for any truncated instructions as hints to 3459 // InstCombine. 3460 if (VF > 1) 3461 truncateToMinimalBitwidths(); 3462 3463 // Fix widened non-induction PHIs by setting up the PHI operands. 3464 if (OrigPHIsToFix.size()) { 3465 assert(EnableVPlanNativePath && 3466 "Unexpected non-induction PHIs for fixup in non VPlan-native path"); 3467 fixNonInductionPHIs(); 3468 } 3469 3470 // At this point every instruction in the original loop is widened to a 3471 // vector form. Now we need to fix the recurrences in the loop. These PHI 3472 // nodes are currently empty because we did not want to introduce cycles. 3473 // This is the second stage of vectorizing recurrences. 3474 fixCrossIterationPHIs(); 3475 3476 // Forget the original basic block. 3477 PSE.getSE()->forgetLoop(OrigLoop); 3478 3479 // Fix-up external users of the induction variables. 3480 for (auto &Entry : Legal->getInductionVars()) 3481 fixupIVUsers(Entry.first, Entry.second, 3482 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)), 3483 IVEndValues[Entry.first], LoopMiddleBlock); 3484 3485 fixLCSSAPHIs(); 3486 for (Instruction *PI : PredicatedInstructions) 3487 sinkScalarOperands(&*PI); 3488 3489 // Remove redundant induction instructions. 3490 cse(LoopVectorBody); 3491 3492 // Set/update profile weights for the vector and remainder loops as original 3493 // loop iterations are now distributed among them. Note that original loop 3494 // represented by LoopScalarBody becomes remainder loop after vectorization. 
3495 //
3496 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3497 // end up with a slightly less precise result, but that should be OK since
3498 // the profile is not inherently precise anyway. Note also that a possible
3499 // bypass of the vector code caused by legality checks is ignored, assigning
3500 // all the weight to the vector loop, optimistically.
3501 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3502 LI->getLoopFor(LoopVectorBody),
3503 LI->getLoopFor(LoopScalarBody), VF * UF);
3504 }
3505
3506 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3507 // In order to support recurrences we need to be able to vectorize Phi nodes.
3508 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3509 // stage #2: We now need to fix the recurrences by adding incoming edges to
3510 // the currently empty PHI nodes. At this point every instruction in the
3511 // original loop is widened to a vector form so we can use them to construct
3512 // the incoming edges.
3513 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3514 // Handle first-order recurrences and reductions that need to be fixed.
3515 if (Legal->isFirstOrderRecurrence(&Phi))
3516 fixFirstOrderRecurrence(&Phi);
3517 else if (Legal->isReductionVariable(&Phi))
3518 fixReduction(&Phi);
3519 }
3520 }
3521
3522 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3523 // This is the second phase of vectorizing first-order recurrences. An
3524 // overview of the transformation is described below. Suppose we have the
3525 // following loop.
3526 //
3527 // for (int i = 0; i < n; ++i)
3528 // b[i] = a[i] - a[i - 1];
3529 //
3530 // There is a first-order recurrence on "a". For this loop, the shorthand
3531 // scalar IR looks like:
3532 //
3533 // scalar.ph:
3534 // s_init = a[-1]
3535 // br scalar.body
3536 //
3537 // scalar.body:
3538 // i = phi [0, scalar.ph], [i+1, scalar.body]
3539 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3540 // s2 = a[i]
3541 // b[i] = s2 - s1
3542 // br cond, scalar.body, ...
3543 //
3544 // In this example, s1 is a recurrence because its value depends on the
3545 // previous iteration. In the first phase of vectorization, we created a
3546 // temporary value for s1. We now complete the vectorization and produce the
3547 // shorthand vector IR shown below (for VF = 4, UF = 1).
3548 //
3549 // vector.ph:
3550 // v_init = vector(..., ..., ..., a[-1])
3551 // br vector.body
3552 //
3553 // vector.body
3554 // i = phi [0, vector.ph], [i+4, vector.body]
3555 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3556 // v2 = a[i, i+1, i+2, i+3];
3557 // v3 = vector(v1(3), v2(0, 1, 2))
3558 // b[i, i+1, i+2, i+3] = v2 - v3
3559 // br cond, vector.body, middle.block
3560 //
3561 // middle.block:
3562 // x = v2(3)
3563 // br scalar.ph
3564 //
3565 // scalar.ph:
3566 // s_init = phi [x, middle.block], [a[-1], otherwise]
3567 // br scalar.body
3568 //
3569 // After the vector loop completes execution, we extract the next value of
3570 // the recurrence (x) to use as the initial value in the scalar loop.
3571
3572 // Get the original loop preheader and single loop latch.
3573 auto *Preheader = OrigLoop->getLoopPreheader();
3574 auto *Latch = OrigLoop->getLoopLatch();
3575
3576 // Get the initial and previous values of the scalar recurrence.
3577 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3578 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3579
3580 // Create a vector from the initial value.
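// For example (illustrative): with VF = 4 the code below builds
//   vector.recur.init = <undef, undef, undef, s_init>
// by inserting the scalar initial value into lane VF - 1 of an undef vector;
// only that last lane is ever read by the recurrence shuffle.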
3581 auto *VectorInit = ScalarInit; 3582 if (VF > 1) { 3583 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3584 VectorInit = Builder.CreateInsertElement( 3585 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, 3586 Builder.getInt32(VF - 1), "vector.recur.init"); 3587 } 3588 3589 // We constructed a temporary phi node in the first phase of vectorization. 3590 // This phi node will eventually be deleted. 3591 Builder.SetInsertPoint( 3592 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0))); 3593 3594 // Create a phi node for the new recurrence. The current value will either be 3595 // the initial value inserted into a vector or loop-varying vector value. 3596 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur"); 3597 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader); 3598 3599 // Get the vectorized previous value of the last part UF - 1. It appears last 3600 // among all unrolled iterations, due to the order of their construction. 3601 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); 3602 3603 // Find and set the insertion point after the previous value if it is an 3604 // instruction. 3605 BasicBlock::iterator InsertPt; 3606 // Note that the previous value may have been constant-folded so it is not 3607 // guaranteed to be an instruction in the vector loop. 3608 // FIXME: Loop invariant values do not form recurrences. We should deal with 3609 // them earlier. 3610 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) 3611 InsertPt = LoopVectorBody->getFirstInsertionPt(); 3612 else { 3613 Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); 3614 if (isa<PHINode>(PreviousLastPart)) 3615 // If the previous value is a phi node, we should insert after all the phi 3616 // nodes in the block containing the PHI to avoid breaking basic block 3617 // verification. Note that the basic block may be different to 3618 // LoopVectorBody, in case we predicate the loop. 3619 InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); 3620 else 3621 InsertPt = ++PreviousInst->getIterator(); 3622 } 3623 Builder.SetInsertPoint(&*InsertPt); 3624 3625 // We will construct a vector for the recurrence by combining the values for 3626 // the current and previous iterations. This is the required shuffle mask. 3627 SmallVector<Constant *, 8> ShuffleMask(VF); 3628 ShuffleMask[0] = Builder.getInt32(VF - 1); 3629 for (unsigned I = 1; I < VF; ++I) 3630 ShuffleMask[I] = Builder.getInt32(I + VF - 1); 3631 3632 // The vector from which to take the initial value for the current iteration 3633 // (actual or unrolled). Initially, this is the vector phi node. 3634 Value *Incoming = VecPhi; 3635 3636 // Shuffle the current and previous vector and update the vector parts. 3637 for (unsigned Part = 0; Part < UF; ++Part) { 3638 Value *PreviousPart = getOrCreateVectorValue(Previous, Part); 3639 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part); 3640 auto *Shuffle = 3641 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart, 3642 ConstantVector::get(ShuffleMask)) 3643 : Incoming; 3644 PhiPart->replaceAllUsesWith(Shuffle); 3645 cast<Instruction>(PhiPart)->eraseFromParent(); 3646 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle); 3647 Incoming = PreviousPart; 3648 } 3649 3650 // Fix the latch value of the new recurrence in the vector loop. 3651 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); 3652 3653 // Extract the last vector element in the middle block. 
This will be the 3654 // initial value for the recurrence when jumping to the scalar loop. 3655 auto *ExtractForScalar = Incoming; 3656 if (VF > 1) { 3657 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); 3658 ExtractForScalar = Builder.CreateExtractElement( 3659 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract"); 3660 } 3661 // Extract the second last element in the middle block if the 3662 // Phi is used outside the loop. We need to extract the phi itself 3663 // and not the last element (the phi update in the current iteration). This 3664 // will be the value when jumping to the exit block from the LoopMiddleBlock, 3665 // when the scalar loop is not run at all. 3666 Value *ExtractForPhiUsedOutsideLoop = nullptr; 3667 if (VF > 1) 3668 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( 3669 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi"); 3670 // When loop is unrolled without vectorizing, initialize 3671 // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of 3672 // `Incoming`. This is analogous to the vectorized case above: extracting the 3673 // second last element when VF > 1. 3674 else if (UF > 1) 3675 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2); 3676 3677 // Fix the initial value of the original recurrence in the scalar loop. 3678 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin()); 3679 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init"); 3680 for (auto *BB : predecessors(LoopScalarPreHeader)) { 3681 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; 3682 Start->addIncoming(Incoming, BB); 3683 } 3684 3685 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start); 3686 Phi->setName("scalar.recur"); 3687 3688 // Finally, fix users of the recurrence outside the loop. The users will need 3689 // either the last value of the scalar recurrence or the last value of the 3690 // vector recurrence we extracted in the middle block. Since the loop is in 3691 // LCSSA form, we just need to find all the phi nodes for the original scalar 3692 // recurrence in the exit block, and then add an edge for the middle block. 3693 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3694 if (LCSSAPhi.getIncomingValue(0) == Phi) { 3695 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); 3696 } 3697 } 3698 } 3699 3700 void InnerLoopVectorizer::fixReduction(PHINode *Phi) { 3701 Constant *Zero = Builder.getInt32(0); 3702 3703 // Get it's reduction variable descriptor. 3704 assert(Legal->isReductionVariable(Phi) && 3705 "Unable to find the reduction variable"); 3706 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; 3707 3708 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3709 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); 3710 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); 3711 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = 3712 RdxDesc.getMinMaxRecurrenceKind(); 3713 setDebugLocFromInst(Builder, ReductionStartValue); 3714 3715 // We need to generate a reduction vector from the incoming scalar. 3716 // To do so, we need to generate the 'identity' vector and override 3717 // one of the elements with the incoming scalar reduction. We need 3718 // to do it in the vector-loop preheader. 3719 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); 3720 3721 // This is the vector-clone of the value that leaves the loop. 
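// For example (illustrative): for an integer add reduction with VF = 4 and
// start value %s, the code below uses Identity = <0, 0, 0, 0> and
// VectorStart = <%s, 0, 0, 0>, so the start value is added exactly once
// (unroll parts other than the first start from the plain identity).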
3722 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3723
3724 // Find the reduction identity variable. Zero for addition, or and xor;
3725 // one for multiplication; -1 for and.
3726 Value *Identity;
3727 Value *VectorStart;
3728 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3729 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3730 // MinMax reductions have the start value as their identity.
3731 if (VF == 1) {
3732 VectorStart = Identity = ReductionStartValue;
3733 } else {
3734 VectorStart = Identity =
3735 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3736 }
3737 } else {
3738 // Handle other reduction kinds:
3739 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3740 RK, VecTy->getScalarType());
3741 if (VF == 1) {
3742 Identity = Iden;
3743 // This vector is the Identity vector where the first element is the
3744 // incoming scalar reduction.
3745 VectorStart = ReductionStartValue;
3746 } else {
3747 Identity = ConstantVector::getSplat({VF, false}, Iden);
3748
3749 // This vector is the Identity vector where the first element is the
3750 // incoming scalar reduction.
3751 VectorStart =
3752 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3753 }
3754 }
3755
3756 // Wrap flags are in general invalid after vectorization, clear them.
3757 clearReductionWrapFlags(RdxDesc);
3758
3759 // Fix the vector-loop phi.
3760
3761 // Reductions do not have to start at zero. They can start with
3762 // any loop invariant values.
3763 BasicBlock *Latch = OrigLoop->getLoopLatch();
3764 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3765
3766 for (unsigned Part = 0; Part < UF; ++Part) {
3767 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3768 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3769 // Make sure to add the reduction start value only to the
3770 // first unroll part.
3771 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3772 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3773 cast<PHINode>(VecRdxPhi)
3774 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3775 }
3776
3777 // Before each round, move the insertion point right between
3778 // the PHIs and the values we are going to write.
3779 // This allows us to write both PHINodes and the extractelement
3780 // instructions.
3781 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3782
3783 setDebugLocFromInst(Builder, LoopExitInst);
3784
3785 // If the tail is folded by masking, the vector value to leave the loop should
3786 // be a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3787 // instead of the former.
3788 if (Cost->foldTailByMasking()) {
3789 for (unsigned Part = 0; Part < UF; ++Part) {
3790 Value *VecLoopExitInst =
3791 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3792 Value *Sel = nullptr;
3793 for (User *U : VecLoopExitInst->users()) {
3794 if (isa<SelectInst>(U)) {
3795 assert(!Sel && "Reduction exit feeding two selects");
3796 Sel = U;
3797 } else
3798 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3799 }
3800 assert(Sel && "Reduction exit feeds no select");
3801 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3802 }
3803 }
3804
3805 // If the vector reduction can be performed in a smaller type, we truncate
3806 // then extend the loop exit value to enable InstCombine to evaluate the
3807 // entire expression in the smaller type.
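// For example (illustrative): an i32 add reduction whose recurrence type was
// narrowed to i8 has its per-part exit values truncated to <VF x i8> below;
// the final reduced scalar is then sign- or zero-extended back to i32 before
// it feeds the resume phi in the scalar preheader.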
3808 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) { 3809 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); 3810 Builder.SetInsertPoint( 3811 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); 3812 VectorParts RdxParts(UF); 3813 for (unsigned Part = 0; Part < UF; ++Part) { 3814 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3815 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3816 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) 3817 : Builder.CreateZExt(Trunc, VecTy); 3818 for (Value::user_iterator UI = RdxParts[Part]->user_begin(); 3819 UI != RdxParts[Part]->user_end();) 3820 if (*UI != Trunc) { 3821 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd); 3822 RdxParts[Part] = Extnd; 3823 } else { 3824 ++UI; 3825 } 3826 } 3827 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); 3828 for (unsigned Part = 0; Part < UF; ++Part) { 3829 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy); 3830 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]); 3831 } 3832 } 3833 3834 // Reduce all of the unrolled parts into a single vector. 3835 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0); 3836 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK); 3837 3838 // The middle block terminator has already been assigned a DebugLoc here (the 3839 // OrigLoop's single latch terminator). We want the whole middle block to 3840 // appear to execute on this line because: (a) it is all compiler generated, 3841 // (b) these instructions are always executed after evaluating the latch 3842 // conditional branch, and (c) other passes may add new predecessors which 3843 // terminate on this line. This is the easiest way to ensure we don't 3844 // accidentally cause an extra step back into the loop while debugging. 3845 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); 3846 for (unsigned Part = 1; Part < UF; ++Part) { 3847 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); 3848 if (Op != Instruction::ICmp && Op != Instruction::FCmp) 3849 // Floating point operations had to be 'fast' to enable the reduction. 3850 ReducedPartRdx = addFastMathFlag( 3851 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, 3852 ReducedPartRdx, "bin.rdx"), 3853 RdxDesc.getFastMathFlags()); 3854 else 3855 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx, 3856 RdxPart); 3857 } 3858 3859 if (VF > 1) { 3860 bool NoNaN = Legal->hasFunNoNaNAttr(); 3861 ReducedPartRdx = 3862 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); 3863 // If the reduction can be performed in a smaller type, we need to extend 3864 // the reduction to the wider type before we branch to the original loop. 3865 if (Phi->getType() != RdxDesc.getRecurrenceType()) 3866 ReducedPartRdx = 3867 RdxDesc.isSigned() 3868 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType()) 3869 : Builder.CreateZExt(ReducedPartRdx, Phi->getType()); 3870 } 3871 3872 // Create a phi node that merges control-flow from the backedge-taken check 3873 // block and the middle block. 
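// Roughly (illustrative):
//   bc.merge.rdx = phi [ %start.value, <each bypass block> ],
//                      [ %reduced.result, %middle.block ]
// so the scalar remainder loop resumes from the partially reduced value when
// the vector loop ran, and from the original start value when it was bypassed.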
3874 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx", 3875 LoopScalarPreHeader->getTerminator()); 3876 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) 3877 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); 3878 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); 3879 3880 // Now, we need to fix the users of the reduction variable 3881 // inside and outside of the scalar remainder loop. 3882 // We know that the loop is in LCSSA form. We need to update the 3883 // PHI nodes in the exit blocks. 3884 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3885 // All PHINodes need to have a single entry edge, or two if 3886 // we already fixed them. 3887 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); 3888 3889 // We found a reduction value exit-PHI. Update it with the 3890 // incoming bypass edge. 3891 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst) 3892 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock); 3893 } // end of the LCSSA phi scan. 3894 3895 // Fix the scalar loop reduction variable with the incoming reduction sum 3896 // from the vector body and from the backedge value. 3897 int IncomingEdgeBlockIdx = 3898 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch()); 3899 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); 3900 // Pick the other block. 3901 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); 3902 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); 3903 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); 3904 } 3905 3906 void InnerLoopVectorizer::clearReductionWrapFlags( 3907 RecurrenceDescriptor &RdxDesc) { 3908 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); 3909 if (RK != RecurrenceDescriptor::RK_IntegerAdd && 3910 RK != RecurrenceDescriptor::RK_IntegerMult) 3911 return; 3912 3913 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); 3914 assert(LoopExitInstr && "null loop exit instruction"); 3915 SmallVector<Instruction *, 8> Worklist; 3916 SmallPtrSet<Instruction *, 8> Visited; 3917 Worklist.push_back(LoopExitInstr); 3918 Visited.insert(LoopExitInstr); 3919 3920 while (!Worklist.empty()) { 3921 Instruction *Cur = Worklist.pop_back_val(); 3922 if (isa<OverflowingBinaryOperator>(Cur)) 3923 for (unsigned Part = 0; Part < UF; ++Part) { 3924 Value *V = getOrCreateVectorValue(Cur, Part); 3925 cast<Instruction>(V)->dropPoisonGeneratingFlags(); 3926 } 3927 3928 for (User *U : Cur->users()) { 3929 Instruction *UI = cast<Instruction>(U); 3930 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && 3931 Visited.insert(UI).second) 3932 Worklist.push_back(UI); 3933 } 3934 } 3935 } 3936 3937 void InnerLoopVectorizer::fixLCSSAPHIs() { 3938 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { 3939 if (LCSSAPhi.getNumIncomingValues() == 1) { 3940 auto *IncomingValue = LCSSAPhi.getIncomingValue(0); 3941 // Non-instruction incoming values will have only one value. 3942 unsigned LastLane = 0; 3943 if (isa<Instruction>(IncomingValue)) 3944 LastLane = Cost->isUniformAfterVectorization( 3945 cast<Instruction>(IncomingValue), VF) 3946 ? 0 3947 : VF - 1; 3948 // Can be a loop invariant incoming value or the last scalar value to be 3949 // extracted from the vectorized loop. 
3950 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3951 Value *lastIncomingValue =
3952 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3953 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3954 }
3955 }
3956 }
3957
3958 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3959 // The basic block and loop containing the predicated instruction.
3960 auto *PredBB = PredInst->getParent();
3961 auto *VectorLoop = LI->getLoopFor(PredBB);
3962
3963 // Initialize a worklist with the operands of the predicated instruction.
3964 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3965
3966 // Holds instructions that we need to analyze again. An instruction may be
3967 // reanalyzed if we don't yet know if we can sink it or not.
3968 SmallVector<Instruction *, 8> InstsToReanalyze;
3969
3970 // Returns true if a given use occurs in the predicated block. Phi nodes use
3971 // their operands in their corresponding predecessor blocks.
3972 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3973 auto *I = cast<Instruction>(U.getUser());
3974 BasicBlock *BB = I->getParent();
3975 if (auto *Phi = dyn_cast<PHINode>(I))
3976 BB = Phi->getIncomingBlock(
3977 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3978 return BB == PredBB;
3979 };
3980
3981 // Iteratively sink the scalarized operands of the predicated instruction
3982 // into the block we created for it. When an instruction is sunk, its
3983 // operands are added to the worklist. The algorithm ends once a full pass
3984 // through the worklist sinks no further instructions.
3985 bool Changed;
3986 do {
3987 // Add the instructions that need to be reanalyzed to the worklist, and
3988 // reset the changed indicator.
3989 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3990 InstsToReanalyze.clear();
3991 Changed = false;
3992
3993 while (!Worklist.empty()) {
3994 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3995
3996 // We can't sink an instruction if it is a phi node, is already in the
3997 // predicated block, is not in the loop, or may have side effects.
3998 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3999 !VectorLoop->contains(I) || I->mayHaveSideEffects())
4000 continue;
4001
4002 // It's legal to sink the instruction if all its uses occur in the
4003 // predicated block. Otherwise, there's nothing to do yet, and we may
4004 // need to reanalyze the instruction.
4005 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4006 InstsToReanalyze.push_back(I);
4007 continue;
4008 }
4009
4010 // Move the instruction to the beginning of the predicated block, and add
4011 // its operands to the worklist.
4012 I->moveBefore(&*PredBB->getFirstInsertionPt());
4013 Worklist.insert(I->op_begin(), I->op_end());
4014
4015 // The sinking may have enabled other instructions to be sunk, so we will
4016 // need to iterate.
4017 Changed = true; 4018 } 4019 } while (Changed); 4020 } 4021 4022 void InnerLoopVectorizer::fixNonInductionPHIs() { 4023 for (PHINode *OrigPhi : OrigPHIsToFix) { 4024 PHINode *NewPhi = 4025 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0)); 4026 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues(); 4027 4028 SmallVector<BasicBlock *, 2> ScalarBBPredecessors( 4029 predecessors(OrigPhi->getParent())); 4030 SmallVector<BasicBlock *, 2> VectorBBPredecessors( 4031 predecessors(NewPhi->getParent())); 4032 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() && 4033 "Scalar and Vector BB should have the same number of predecessors"); 4034 4035 // The insertion point in Builder may be invalidated by the time we get 4036 // here. Force the Builder insertion point to something valid so that we do 4037 // not run into issues during insertion point restore in 4038 // getOrCreateVectorValue calls below. 4039 Builder.SetInsertPoint(NewPhi); 4040 4041 // The predecessor order is preserved and we can rely on mapping between 4042 // scalar and vector block predecessors. 4043 for (unsigned i = 0; i < NumIncomingValues; ++i) { 4044 BasicBlock *NewPredBB = VectorBBPredecessors[i]; 4045 4046 // When looking up the new scalar/vector values to fix up, use incoming 4047 // values from original phi. 4048 Value *ScIncV = 4049 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]); 4050 4051 // Scalar incoming value may need a broadcast 4052 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0); 4053 NewPhi->addIncoming(NewIncV, NewPredBB); 4054 } 4055 } 4056 } 4057 4058 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, 4059 unsigned VF, bool IsPtrLoopInvariant, 4060 SmallBitVector &IsIndexLoopInvariant) { 4061 // Construct a vector GEP by widening the operands of the scalar GEP as 4062 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP 4063 // results in a vector of pointers when at least one operand of the GEP 4064 // is vector-typed. Thus, to keep the representation compact, we only use 4065 // vector-typed operands for loop-varying values. 4066 4067 if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { 4068 // If we are vectorizing, but the GEP has only loop-invariant operands, 4069 // the GEP we build (by only using vector-typed operands for 4070 // loop-varying values) would be a scalar pointer. Thus, to ensure we 4071 // produce a vector of pointers, we need to either arbitrarily pick an 4072 // operand to broadcast, or broadcast a clone of the original GEP. 4073 // Here, we broadcast a clone of the original. 4074 // 4075 // TODO: If at some point we decide to scalarize instructions having 4076 // loop-invariant operands, this special case will no longer be 4077 // required. We would add the scalarization decision to 4078 // collectLoopScalars() and teach getVectorValue() to broadcast 4079 // the lane-zero scalar value. 4080 auto *Clone = Builder.Insert(GEP->clone()); 4081 for (unsigned Part = 0; Part < UF; ++Part) { 4082 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); 4083 VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); 4084 addMetadata(EntryPart, GEP); 4085 } 4086 } else { 4087 // If the GEP has at least one loop-varying operand, we are sure to 4088 // produce a vector of pointers. But if we are only unrolling, we want 4089 // to produce a scalar GEP for each unroll part. Thus, the GEP we 4090 // produce with the code below will be scalar (if VF == 1) or vector 4091 // (otherwise). 
Note that for the unroll-only case, we still maintain 4092 // values in the vector mapping with initVector, as we do for other 4093 // instructions. 4094 for (unsigned Part = 0; Part < UF; ++Part) { 4095 // The pointer operand of the new GEP. If it's loop-invariant, we 4096 // won't broadcast it. 4097 auto *Ptr = IsPtrLoopInvariant 4098 ? GEP->getPointerOperand() 4099 : getOrCreateVectorValue(GEP->getPointerOperand(), Part); 4100 4101 // Collect all the indices for the new GEP. If any index is 4102 // loop-invariant, we won't broadcast it. 4103 SmallVector<Value *, 4> Indices; 4104 for (auto Index : enumerate(GEP->indices())) { 4105 Value *User = Index.value().get(); 4106 if (IsIndexLoopInvariant[Index.index()]) 4107 Indices.push_back(User); 4108 else 4109 Indices.push_back(getOrCreateVectorValue(User, Part)); 4110 } 4111 4112 // Create the new GEP. Note that this GEP may be a scalar if VF == 1, 4113 // but it should be a vector, otherwise. 4114 auto *NewGEP = 4115 GEP->isInBounds() 4116 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, 4117 Indices) 4118 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); 4119 assert((VF == 1 || NewGEP->getType()->isVectorTy()) && 4120 "NewGEP is not a pointer vector"); 4121 VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); 4122 addMetadata(NewGEP, GEP); 4123 } 4124 } 4125 } 4126 4127 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, 4128 unsigned VF) { 4129 PHINode *P = cast<PHINode>(PN); 4130 if (EnableVPlanNativePath) { 4131 // Currently we enter here in the VPlan-native path for non-induction 4132 // PHIs where all control flow is uniform. We simply widen these PHIs. 4133 // Create a vector phi with no operands - the vector phi operands will be 4134 // set at the end of vector code generation. 4135 Type *VecTy = 4136 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4137 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi"); 4138 VectorLoopValueMap.setVectorValue(P, 0, VecPhi); 4139 OrigPHIsToFix.push_back(P); 4140 4141 return; 4142 } 4143 4144 assert(PN->getParent() == OrigLoop->getHeader() && 4145 "Non-header phis should have been handled elsewhere"); 4146 4147 // In order to support recurrences we need to be able to vectorize Phi nodes. 4148 // Phi nodes have cycles, so we need to vectorize them in two stages. This is 4149 // stage #1: We create a new vector PHI node with no incoming edges. We'll use 4150 // this value when we vectorize all of the instructions that use the PHI. 4151 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) { 4152 for (unsigned Part = 0; Part < UF; ++Part) { 4153 // This is phase one of vectorizing PHIs. 4154 Type *VecTy = 4155 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); 4156 Value *EntryPart = PHINode::Create( 4157 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt()); 4158 VectorLoopValueMap.setVectorValue(P, Part, EntryPart); 4159 } 4160 return; 4161 } 4162 4163 setDebugLocFromInst(Builder, P); 4164 4165 // This PHINode must be an induction variable. 4166 // Make sure that we know about it. 4167 assert(Legal->getInductionVars().count(P) && "Not an induction variable"); 4168 4169 InductionDescriptor II = Legal->getInductionVars().lookup(P); 4170 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); 4171 4172 // FIXME: The newly created binary instructions should contain nsw/nuw flags, 4173 // which can be found from the original scalar operations. 
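// For the IK_PtrInduction case handled below, each unroll part and lane gets
// its own scalar address. As a rough sketch (illustrative, VF = 4): lane
// Lane of part Part derives "next.gep" from the normalized index
// PtrInd + Part * VF + Lane, and if the pointer IV is uniform after
// vectorization only lane 0 is emitted. No wide vector of pointers is
// materialized, since scalar GEPs give better addressing code.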
4174 switch (II.getKind()) { 4175 case InductionDescriptor::IK_NoInduction: 4176 llvm_unreachable("Unknown induction"); 4177 case InductionDescriptor::IK_IntInduction: 4178 case InductionDescriptor::IK_FpInduction: 4179 llvm_unreachable("Integer/fp induction is handled elsewhere."); 4180 case InductionDescriptor::IK_PtrInduction: { 4181 // Handle the pointer induction variable case. 4182 assert(P->getType()->isPointerTy() && "Unexpected type."); 4183 // This is the normalized GEP that starts counting at zero. 4184 Value *PtrInd = Induction; 4185 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType()); 4186 // Determine the number of scalars we need to generate for each unroll 4187 // iteration. If the instruction is uniform, we only need to generate the 4188 // first lane. Otherwise, we generate all VF values. 4189 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF; 4190 // These are the scalar results. Notice that we don't generate vector GEPs 4191 // because scalar GEPs result in better code. 4192 for (unsigned Part = 0; Part < UF; ++Part) { 4193 for (unsigned Lane = 0; Lane < Lanes; ++Lane) { 4194 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF); 4195 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); 4196 Value *SclrGep = 4197 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); 4198 SclrGep->setName("next.gep"); 4199 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); 4200 } 4201 } 4202 return; 4203 } 4204 } 4205 } 4206 4207 /// A helper function for checking whether an integer division-related 4208 /// instruction may divide by zero (in which case it must be predicated if 4209 /// executed conditionally in the scalar code). 4210 /// TODO: It may be worthwhile to generalize and check isKnownNonZero(). 4211 /// Non-zero divisors that are non compile-time constants will not be 4212 /// converted into multiplication, so we will still end up scalarizing 4213 /// the division, but can do so w/o predication. 4214 static bool mayDivideByZero(Instruction &I) { 4215 assert((I.getOpcode() == Instruction::UDiv || 4216 I.getOpcode() == Instruction::SDiv || 4217 I.getOpcode() == Instruction::URem || 4218 I.getOpcode() == Instruction::SRem) && 4219 "Unexpected instruction"); 4220 Value *Divisor = I.getOperand(1); 4221 auto *CInt = dyn_cast<ConstantInt>(Divisor); 4222 return !CInt || CInt->isZero(); 4223 } 4224 4225 void InnerLoopVectorizer::widenInstruction(Instruction &I) { 4226 switch (I.getOpcode()) { 4227 case Instruction::Br: 4228 case Instruction::PHI: 4229 case Instruction::GetElementPtr: 4230 llvm_unreachable("This instruction is handled by a different recipe."); 4231 case Instruction::UDiv: 4232 case Instruction::SDiv: 4233 case Instruction::SRem: 4234 case Instruction::URem: 4235 case Instruction::Add: 4236 case Instruction::FAdd: 4237 case Instruction::Sub: 4238 case Instruction::FSub: 4239 case Instruction::FNeg: 4240 case Instruction::Mul: 4241 case Instruction::FMul: 4242 case Instruction::FDiv: 4243 case Instruction::FRem: 4244 case Instruction::Shl: 4245 case Instruction::LShr: 4246 case Instruction::AShr: 4247 case Instruction::And: 4248 case Instruction::Or: 4249 case Instruction::Xor: { 4250 // Just widen unops and binops. 
4251 setDebugLocFromInst(Builder, &I); 4252 4253 for (unsigned Part = 0; Part < UF; ++Part) { 4254 SmallVector<Value *, 2> Ops; 4255 for (Value *Op : I.operands()) 4256 Ops.push_back(getOrCreateVectorValue(Op, Part)); 4257 4258 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops); 4259 4260 if (auto *VecOp = dyn_cast<Instruction>(V)) 4261 VecOp->copyIRFlags(&I); 4262 4263 // Use this vector value for all users of the original instruction. 4264 VectorLoopValueMap.setVectorValue(&I, Part, V); 4265 addMetadata(V, &I); 4266 } 4267 4268 break; 4269 } 4270 case Instruction::Select: { 4271 // Widen selects. 4272 // If the selector is loop invariant we can create a select 4273 // instruction with a scalar condition. Otherwise, use vector-select. 4274 auto *SE = PSE.getSE(); 4275 bool InvariantCond = 4276 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop); 4277 setDebugLocFromInst(Builder, &I); 4278 4279 // The condition can be loop invariant but still defined inside the 4280 // loop. This means that we can't just use the original 'cond' value. 4281 // We have to take the 'vectorized' value and pick the first lane. 4282 // Instcombine will make this a no-op. 4283 4284 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); 4285 4286 for (unsigned Part = 0; Part < UF; ++Part) { 4287 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); 4288 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part); 4289 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part); 4290 Value *Sel = 4291 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1); 4292 VectorLoopValueMap.setVectorValue(&I, Part, Sel); 4293 addMetadata(Sel, &I); 4294 } 4295 4296 break; 4297 } 4298 4299 case Instruction::ICmp: 4300 case Instruction::FCmp: { 4301 // Widen compares. Generate vector compares. 4302 bool FCmp = (I.getOpcode() == Instruction::FCmp); 4303 auto *Cmp = cast<CmpInst>(&I); 4304 setDebugLocFromInst(Builder, Cmp); 4305 for (unsigned Part = 0; Part < UF; ++Part) { 4306 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); 4307 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part); 4308 Value *C = nullptr; 4309 if (FCmp) { 4310 // Propagate fast math flags. 4311 IRBuilder<>::FastMathFlagGuard FMFG(Builder); 4312 Builder.setFastMathFlags(Cmp->getFastMathFlags()); 4313 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B); 4314 } else { 4315 C = Builder.CreateICmp(Cmp->getPredicate(), A, B); 4316 } 4317 VectorLoopValueMap.setVectorValue(&I, Part, C); 4318 addMetadata(C, &I); 4319 } 4320 4321 break; 4322 } 4323 4324 case Instruction::ZExt: 4325 case Instruction::SExt: 4326 case Instruction::FPToUI: 4327 case Instruction::FPToSI: 4328 case Instruction::FPExt: 4329 case Instruction::PtrToInt: 4330 case Instruction::IntToPtr: 4331 case Instruction::SIToFP: 4332 case Instruction::UIToFP: 4333 case Instruction::Trunc: 4334 case Instruction::FPTrunc: 4335 case Instruction::BitCast: { 4336 auto *CI = cast<CastInst>(&I); 4337 setDebugLocFromInst(Builder, CI); 4338 4339 /// Vectorize casts. 4340 Type *DestTy = 4341 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); 4342 4343 for (unsigned Part = 0; Part < UF; ++Part) { 4344 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part); 4345 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy); 4346 VectorLoopValueMap.setVectorValue(&I, Part, Cast); 4347 addMetadata(Cast, &I); 4348 } 4349 break; 4350 } 4351 4352 case Instruction::Call: { 4353 // Ignore dbg intrinsics. 
4354 if (isa<DbgInfoIntrinsic>(I)) 4355 break; 4356 setDebugLocFromInst(Builder, &I); 4357 4358 Module *M = I.getParent()->getParent()->getParent(); 4359 auto *CI = cast<CallInst>(&I); 4360 4361 SmallVector<Type *, 4> Tys; 4362 for (Value *ArgOperand : CI->arg_operands()) 4363 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); 4364 4365 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 4366 4367 // The flag shows whether we use Intrinsic or a usual Call for vectorized 4368 // version of the instruction. 4369 // Is it beneficial to perform intrinsic call compared to lib call? 4370 bool NeedToScalarize = false; 4371 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); 4372 bool UseVectorIntrinsic = 4373 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; 4374 assert((UseVectorIntrinsic || !NeedToScalarize) && 4375 "Instruction should be scalarized elsewhere."); 4376 4377 for (unsigned Part = 0; Part < UF; ++Part) { 4378 SmallVector<Value *, 4> Args; 4379 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 4380 Value *Arg = CI->getArgOperand(i); 4381 // Some intrinsics have a scalar argument - don't replace it with a 4382 // vector. 4383 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) 4384 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); 4385 Args.push_back(Arg); 4386 } 4387 4388 Function *VectorF; 4389 if (UseVectorIntrinsic) { 4390 // Use vector version of the intrinsic. 4391 Type *TysForDecl[] = {CI->getType()}; 4392 if (VF > 1) 4393 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); 4394 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); 4395 } else { 4396 // Use vector version of the function call. 4397 const VFShape Shape = 4398 VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); 4399 #ifndef NDEBUG 4400 const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI); 4401 assert(std::find_if(Infos.begin(), Infos.end(), 4402 [&Shape](const VFInfo &Info) { 4403 return Info.Shape == Shape; 4404 }) != Infos.end() && 4405 "Vector function shape is missing from the database."); 4406 #endif 4407 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); 4408 } 4409 assert(VectorF && "Can't create vector function."); 4410 4411 SmallVector<OperandBundleDef, 1> OpBundles; 4412 CI->getOperandBundlesAsDefs(OpBundles); 4413 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles); 4414 4415 if (isa<FPMathOperator>(V)) 4416 V->copyFastMathFlags(CI); 4417 4418 VectorLoopValueMap.setVectorValue(&I, Part, V); 4419 addMetadata(V, &I); 4420 } 4421 4422 break; 4423 } 4424 4425 default: 4426 // This instruction is not vectorized by simple widening. 4427 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); 4428 llvm_unreachable("Unhandled instruction!"); 4429 } // end of switch. 4430 } 4431 4432 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { 4433 // We should not collect Scalars more than once per VF. Right now, this 4434 // function is called from collectUniformsAndScalars(), which already does 4435 // this check. Collecting Scalars for VF=1 does not make any sense. 4436 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() && 4437 "This function should not be visited twice for the same VF"); 4438 4439 SmallSetVector<Instruction *, 8> Worklist; 4440 4441 // These sets are used to seed the analysis with pointers used by memory 4442 // accesses that will remain scalar. 
4443 SmallSetVector<Instruction *, 8> ScalarPtrs; 4444 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; 4445 4446 // A helper that returns true if the use of Ptr by MemAccess will be scalar. 4447 // The pointer operands of loads and stores will be scalar as long as the 4448 // memory access is not a gather or scatter operation. The value operand of a 4449 // store will remain scalar if the store is scalarized. 4450 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { 4451 InstWidening WideningDecision = getWideningDecision(MemAccess, VF); 4452 assert(WideningDecision != CM_Unknown && 4453 "Widening decision should be ready at this moment"); 4454 if (auto *Store = dyn_cast<StoreInst>(MemAccess)) 4455 if (Ptr == Store->getValueOperand()) 4456 return WideningDecision == CM_Scalarize; 4457 assert(Ptr == getLoadStorePointerOperand(MemAccess) && 4458 "Ptr is neither a value or pointer operand"); 4459 return WideningDecision != CM_GatherScatter; 4460 }; 4461 4462 // A helper that returns true if the given value is a bitcast or 4463 // getelementptr instruction contained in the loop. 4464 auto isLoopVaryingBitCastOrGEP = [&](Value *V) { 4465 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) || 4466 isa<GetElementPtrInst>(V)) && 4467 !TheLoop->isLoopInvariant(V); 4468 }; 4469 4470 // A helper that evaluates a memory access's use of a pointer. If the use 4471 // will be a scalar use, and the pointer is only used by memory accesses, we 4472 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in 4473 // PossibleNonScalarPtrs. 4474 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { 4475 // We only care about bitcast and getelementptr instructions contained in 4476 // the loop. 4477 if (!isLoopVaryingBitCastOrGEP(Ptr)) 4478 return; 4479 4480 // If the pointer has already been identified as scalar (e.g., if it was 4481 // also identified as uniform), there's nothing to do. 4482 auto *I = cast<Instruction>(Ptr); 4483 if (Worklist.count(I)) 4484 return; 4485 4486 // If the use of the pointer will be a scalar use, and all users of the 4487 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 4488 // place the pointer in PossibleNonScalarPtrs. 4489 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { 4490 return isa<LoadInst>(U) || isa<StoreInst>(U); 4491 })) 4492 ScalarPtrs.insert(I); 4493 else 4494 PossibleNonScalarPtrs.insert(I); 4495 }; 4496 4497 // We seed the scalars analysis with three classes of instructions: (1) 4498 // instructions marked uniform-after-vectorization, (2) bitcast and 4499 // getelementptr instructions used by memory accesses requiring a scalar use, 4500 // and (3) pointer induction variables and their update instructions (we 4501 // currently only scalarize these). 4502 // 4503 // (1) Add to the worklist all instructions that have been identified as 4504 // uniform-after-vectorization. 4505 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); 4506 4507 // (2) Add to the worklist all bitcast and getelementptr instructions used by 4508 // memory accesses requiring a scalar use. The pointer operands of loads and 4509 // stores will be scalar as long as the memory accesses is not a gather or 4510 // scatter operation. The value operand of a store will remain scalar if the 4511 // store is scalarized. 
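// For illustration (assuming the widening decisions are already available,
// as asserted in isScalarUse above): a getelementptr that only feeds a
// consecutive, widened load keeps a scalar address and is placed in
// ScalarPtrs; the same getelementptr additionally used by a gather/scatter
// or by a non-memory instruction ends up in PossibleNonScalarPtrs and is
// dropped from the seed set below.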
4512 for (auto *BB : TheLoop->blocks()) 4513 for (auto &I : *BB) { 4514 if (auto *Load = dyn_cast<LoadInst>(&I)) { 4515 evaluatePtrUse(Load, Load->getPointerOperand()); 4516 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 4517 evaluatePtrUse(Store, Store->getPointerOperand()); 4518 evaluatePtrUse(Store, Store->getValueOperand()); 4519 } 4520 } 4521 for (auto *I : ScalarPtrs) 4522 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) { 4523 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n"); 4524 Worklist.insert(I); 4525 } 4526 4527 // (3) Add to the worklist all pointer induction variables and their update 4528 // instructions. 4529 // 4530 // TODO: Once we are able to vectorize pointer induction variables we should 4531 // no longer insert them into the worklist here. 4532 auto *Latch = TheLoop->getLoopLatch(); 4533 for (auto &Induction : Legal->getInductionVars()) { 4534 auto *Ind = Induction.first; 4535 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4536 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction) 4537 continue; 4538 Worklist.insert(Ind); 4539 Worklist.insert(IndUpdate); 4540 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4541 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4542 << "\n"); 4543 } 4544 4545 // Insert the forced scalars. 4546 // FIXME: Currently widenPHIInstruction() often creates a dead vector 4547 // induction variable when the PHI user is scalarized. 4548 auto ForcedScalar = ForcedScalars.find(VF); 4549 if (ForcedScalar != ForcedScalars.end()) 4550 for (auto *I : ForcedScalar->second) 4551 Worklist.insert(I); 4552 4553 // Expand the worklist by looking through any bitcasts and getelementptr 4554 // instructions we've already identified as scalar. This is similar to the 4555 // expansion step in collectLoopUniforms(); however, here we're only 4556 // expanding to include additional bitcasts and getelementptr instructions. 4557 unsigned Idx = 0; 4558 while (Idx != Worklist.size()) { 4559 Instruction *Dst = Worklist[Idx++]; 4560 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0))) 4561 continue; 4562 auto *Src = cast<Instruction>(Dst->getOperand(0)); 4563 if (llvm::all_of(Src->users(), [&](User *U) -> bool { 4564 auto *J = cast<Instruction>(U); 4565 return !TheLoop->contains(J) || Worklist.count(J) || 4566 ((isa<LoadInst>(J) || isa<StoreInst>(J)) && 4567 isScalarUse(J, Src)); 4568 })) { 4569 Worklist.insert(Src); 4570 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); 4571 } 4572 } 4573 4574 // An induction variable will remain scalar if all users of the induction 4575 // variable and induction variable update remain scalar. 4576 for (auto &Induction : Legal->getInductionVars()) { 4577 auto *Ind = Induction.first; 4578 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4579 4580 // We already considered pointer induction variables, so there's no reason 4581 // to look at their users again. 4582 // 4583 // TODO: Once we are able to vectorize pointer induction variables we 4584 // should no longer skip over them here. 4585 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) 4586 continue; 4587 4588 // Determine if all users of the induction variable are scalar after 4589 // vectorization. 
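// For example (illustrative): an induction variable whose only users are its
// own update, values outside the loop, and instructions already known to be
// scalar (such as a uniform exit compare or scalarized addresses) passes
// both checks below and stays scalar together with its update; if any user
// is widened, the induction variable is left to be vectorized.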
4590 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4591 auto *I = cast<Instruction>(U); 4592 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I); 4593 }); 4594 if (!ScalarInd) 4595 continue; 4596 4597 // Determine if all users of the induction variable update instruction are 4598 // scalar after vectorization. 4599 auto ScalarIndUpdate = 4600 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4601 auto *I = cast<Instruction>(U); 4602 return I == Ind || !TheLoop->contains(I) || Worklist.count(I); 4603 }); 4604 if (!ScalarIndUpdate) 4605 continue; 4606 4607 // The induction variable and its update instruction will remain scalar. 4608 Worklist.insert(Ind); 4609 Worklist.insert(IndUpdate); 4610 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n"); 4611 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate 4612 << "\n"); 4613 } 4614 4615 Scalars[VF].insert(Worklist.begin(), Worklist.end()); 4616 } 4617 4618 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) { 4619 if (!blockNeedsPredication(I->getParent())) 4620 return false; 4621 switch(I->getOpcode()) { 4622 default: 4623 break; 4624 case Instruction::Load: 4625 case Instruction::Store: { 4626 if (!Legal->isMaskRequired(I)) 4627 return false; 4628 auto *Ptr = getLoadStorePointerOperand(I); 4629 auto *Ty = getMemInstValueType(I); 4630 // We have already decided how to vectorize this instruction, get that 4631 // result. 4632 if (VF > 1) { 4633 InstWidening WideningDecision = getWideningDecision(I, VF); 4634 assert(WideningDecision != CM_Unknown && 4635 "Widening decision should be ready at this moment"); 4636 return WideningDecision == CM_Scalarize; 4637 } 4638 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4639 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || 4640 isLegalMaskedGather(Ty, Alignment)) 4641 : !(isLegalMaskedStore(Ty, Ptr, Alignment) || 4642 isLegalMaskedScatter(Ty, Alignment)); 4643 } 4644 case Instruction::UDiv: 4645 case Instruction::SDiv: 4646 case Instruction::SRem: 4647 case Instruction::URem: 4648 return mayDivideByZero(*I); 4649 } 4650 return false; 4651 } 4652 4653 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, 4654 unsigned VF) { 4655 assert(isAccessInterleaved(I) && "Expecting interleaved access."); 4656 assert(getWideningDecision(I, VF) == CM_Unknown && 4657 "Decision should not be set yet."); 4658 auto *Group = getInterleavedAccessGroup(I); 4659 assert(Group && "Must have a group."); 4660 4661 // If the instruction's allocated size doesn't equal it's type size, it 4662 // requires padding and will be scalarized. 4663 auto &DL = I->getModule()->getDataLayout(); 4664 auto *ScalarTy = getMemInstValueType(I); 4665 if (hasIrregularType(ScalarTy, DL, VF)) 4666 return false; 4667 4668 // Check if masking is required. 4669 // A Group may need masking for one of two reasons: it resides in a block that 4670 // needs predication, or it was decided to use masking to deal with gaps. 
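// For example (illustrative): a load group accessing A[3*i] and A[3*i + 1]
// with no member for A[3*i + 2] has a gap, so its last full-width access
// could read past the end of A; such a group either keeps a scalar epilogue
// or, when a scalar epilogue is not allowed here, must be masked.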
4671 bool PredicatedAccessRequiresMasking = 4672 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); 4673 bool AccessWithGapsRequiresMasking = 4674 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 4675 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) 4676 return true; 4677 4678 // If masked interleaving is required, we expect that the user/target had 4679 // enabled it, because otherwise it either wouldn't have been created or 4680 // it should have been invalidated by the CostModel. 4681 assert(useMaskedInterleavedAccesses(TTI) && 4682 "Masked interleave-groups for predicated accesses are not enabled."); 4683 4684 auto *Ty = getMemInstValueType(I); 4685 const MaybeAlign Alignment = getLoadStoreAlignment(I); 4686 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) 4687 : TTI.isLegalMaskedStore(Ty, Alignment); 4688 } 4689 4690 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, 4691 unsigned VF) { 4692 // Get and ensure we have a valid memory instruction. 4693 LoadInst *LI = dyn_cast<LoadInst>(I); 4694 StoreInst *SI = dyn_cast<StoreInst>(I); 4695 assert((LI || SI) && "Invalid memory instruction"); 4696 4697 auto *Ptr = getLoadStorePointerOperand(I); 4698 4699 // In order to be widened, the pointer should be consecutive, first of all. 4700 if (!Legal->isConsecutivePtr(Ptr)) 4701 return false; 4702 4703 // If the instruction is a store located in a predicated block, it will be 4704 // scalarized. 4705 if (isScalarWithPredication(I)) 4706 return false; 4707 4708 // If the instruction's allocated size doesn't equal it's type size, it 4709 // requires padding and will be scalarized. 4710 auto &DL = I->getModule()->getDataLayout(); 4711 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); 4712 if (hasIrregularType(ScalarTy, DL, VF)) 4713 return false; 4714 4715 return true; 4716 } 4717 4718 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { 4719 // We should not collect Uniforms more than once per VF. Right now, 4720 // this function is called from collectUniformsAndScalars(), which 4721 // already does this check. Collecting Uniforms for VF=1 does not make any 4722 // sense. 4723 4724 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() && 4725 "This function should not be visited twice for the same VF"); 4726 4727 // Visit the list of Uniforms. If we'll not find any uniform value, we'll 4728 // not analyze again. Uniforms.count(VF) will return 1. 4729 Uniforms[VF].clear(); 4730 4731 // We now know that the loop is vectorizable! 4732 // Collect instructions inside the loop that will remain uniform after 4733 // vectorization. 4734 4735 // Global values, params and instructions outside of current loop are out of 4736 // scope. 4737 auto isOutOfScope = [&](Value *V) -> bool { 4738 Instruction *I = dyn_cast<Instruction>(V); 4739 return (!I || !TheLoop->contains(I)); 4740 }; 4741 4742 SetVector<Instruction *> Worklist; 4743 BasicBlock *Latch = TheLoop->getLoopLatch(); 4744 4745 // Instructions that are scalar with predication must not be considered 4746 // uniform after vectorization, because that would create an erroneous 4747 // replicating region where only a single instance out of VF should be formed. 4748 // TODO: optimize such seldom cases if found important, see PR40816. 
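// For example (illustrative): a udiv executed only under a loop-varying
// condition is scalarized and predicated per lane, so it is rejected by the
// helper below even if its operands are uniform; the latch compare feeding
// the backedge branch, by contrast, is a typical uniform instruction.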
4749 auto addToWorklistIfAllowed = [&](Instruction *I) -> void { 4750 if (isScalarWithPredication(I, VF)) { 4751 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " 4752 << *I << "\n"); 4753 return; 4754 } 4755 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); 4756 Worklist.insert(I); 4757 }; 4758 4759 // Start with the conditional branch. If the branch condition is an 4760 // instruction contained in the loop that is only used by the branch, it is 4761 // uniform. 4762 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 4763 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) 4764 addToWorklistIfAllowed(Cmp); 4765 4766 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers 4767 // are pointers that are treated like consecutive pointers during 4768 // vectorization. The pointer operands of interleaved accesses are an 4769 // example. 4770 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs; 4771 4772 // Holds pointer operands of instructions that are possibly non-uniform. 4773 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs; 4774 4775 auto isUniformDecision = [&](Instruction *I, unsigned VF) { 4776 InstWidening WideningDecision = getWideningDecision(I, VF); 4777 assert(WideningDecision != CM_Unknown && 4778 "Widening decision should be ready at this moment"); 4779 4780 return (WideningDecision == CM_Widen || 4781 WideningDecision == CM_Widen_Reverse || 4782 WideningDecision == CM_Interleave); 4783 }; 4784 // Iterate over the instructions in the loop, and collect all 4785 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible 4786 // that a consecutive-like pointer operand will be scalarized, we collect it 4787 // in PossibleNonUniformPtrs instead. We use two sets here because a single 4788 // getelementptr instruction can be used by both vectorized and scalarized 4789 // memory instructions. For example, if a loop loads and stores from the same 4790 // location, but the store is conditional, the store will be scalarized, and 4791 // the getelementptr won't remain uniform. 4792 for (auto *BB : TheLoop->blocks()) 4793 for (auto &I : *BB) { 4794 // If there's no pointer operand, there's nothing to do. 4795 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 4796 if (!Ptr) 4797 continue; 4798 4799 // True if all users of Ptr are memory accesses that have Ptr as their 4800 // pointer operand. 4801 auto UsersAreMemAccesses = 4802 llvm::all_of(Ptr->users(), [&](User *U) -> bool { 4803 return getLoadStorePointerOperand(U) == Ptr; 4804 }); 4805 4806 // Ensure the memory instruction will not be scalarized or used by 4807 // gather/scatter, making its pointer operand non-uniform. If the pointer 4808 // operand is used by any instruction other than a memory access, we 4809 // conservatively assume the pointer operand may be non-uniform. 4810 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF)) 4811 PossibleNonUniformPtrs.insert(Ptr); 4812 4813 // If the memory instruction will be vectorized and its pointer operand 4814 // is consecutive-like, or interleaving - the pointer operand should 4815 // remain uniform. 4816 else 4817 ConsecutiveLikePtrs.insert(Ptr); 4818 } 4819 4820 // Add to the Worklist all consecutive and consecutive-like pointers that 4821 // aren't also identified as possibly non-uniform. 
4822 for (auto *V : ConsecutiveLikePtrs) 4823 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) 4824 addToWorklistIfAllowed(V); 4825 4826 // Expand Worklist in topological order: whenever a new instruction 4827 // is added , its users should be already inside Worklist. It ensures 4828 // a uniform instruction will only be used by uniform instructions. 4829 unsigned idx = 0; 4830 while (idx != Worklist.size()) { 4831 Instruction *I = Worklist[idx++]; 4832 4833 for (auto OV : I->operand_values()) { 4834 // isOutOfScope operands cannot be uniform instructions. 4835 if (isOutOfScope(OV)) 4836 continue; 4837 // First order recurrence Phi's should typically be considered 4838 // non-uniform. 4839 auto *OP = dyn_cast<PHINode>(OV); 4840 if (OP && Legal->isFirstOrderRecurrence(OP)) 4841 continue; 4842 // If all the users of the operand are uniform, then add the 4843 // operand into the uniform worklist. 4844 auto *OI = cast<Instruction>(OV); 4845 if (llvm::all_of(OI->users(), [&](User *U) -> bool { 4846 auto *J = cast<Instruction>(U); 4847 return Worklist.count(J) || 4848 (OI == getLoadStorePointerOperand(J) && 4849 isUniformDecision(J, VF)); 4850 })) 4851 addToWorklistIfAllowed(OI); 4852 } 4853 } 4854 4855 // Returns true if Ptr is the pointer operand of a memory access instruction 4856 // I, and I is known to not require scalarization. 4857 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { 4858 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); 4859 }; 4860 4861 // For an instruction to be added into Worklist above, all its users inside 4862 // the loop should also be in Worklist. However, this condition cannot be 4863 // true for phi nodes that form a cyclic dependence. We must process phi 4864 // nodes separately. An induction variable will remain uniform if all users 4865 // of the induction variable and induction variable update remain uniform. 4866 // The code below handles both pointer and non-pointer induction variables. 4867 for (auto &Induction : Legal->getInductionVars()) { 4868 auto *Ind = Induction.first; 4869 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 4870 4871 // Determine if all users of the induction variable are uniform after 4872 // vectorization. 4873 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { 4874 auto *I = cast<Instruction>(U); 4875 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || 4876 isVectorizedMemAccessUse(I, Ind); 4877 }); 4878 if (!UniformInd) 4879 continue; 4880 4881 // Determine if all users of the induction variable update instruction are 4882 // uniform after vectorization. 4883 auto UniformIndUpdate = 4884 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 4885 auto *I = cast<Instruction>(U); 4886 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || 4887 isVectorizedMemAccessUse(I, IndUpdate); 4888 }); 4889 if (!UniformIndUpdate) 4890 continue; 4891 4892 // The induction variable and its update instruction will remain uniform. 4893 addToWorklistIfAllowed(Ind); 4894 addToWorklistIfAllowed(IndUpdate); 4895 } 4896 4897 Uniforms[VF].insert(Worklist.begin(), Worklist.end()); 4898 } 4899 4900 bool LoopVectorizationCostModel::runtimeChecksRequired() { 4901 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); 4902 4903 if (Legal->getRuntimePointerChecking()->Need) { 4904 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", 4905 "runtime pointer checks needed. 
Enable vectorization of this " 4906 "loop with '#pragma clang loop vectorize(enable)' when " 4907 "compiling with -Os/-Oz", 4908 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4909 return true; 4910 } 4911 4912 if (!PSE.getUnionPredicate().getPredicates().empty()) { 4913 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", 4914 "runtime SCEV checks needed. Enable vectorization of this " 4915 "loop with '#pragma clang loop vectorize(enable)' when " 4916 "compiling with -Os/-Oz", 4917 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4918 return true; 4919 } 4920 4921 // FIXME: Avoid specializing for stride==1 instead of bailing out. 4922 if (!Legal->getLAI()->getSymbolicStrides().empty()) { 4923 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", 4924 "runtime stride == 1 checks needed. Enable vectorization of " 4925 "this loop with '#pragma clang loop vectorize(enable)' when " 4926 "compiling with -Os/-Oz", 4927 "CantVersionLoopWithOptForSize", ORE, TheLoop); 4928 return true; 4929 } 4930 4931 return false; 4932 } 4933 4934 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { 4935 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { 4936 // TODO: It may by useful to do since it's still likely to be dynamically 4937 // uniform if the target can skip. 4938 reportVectorizationFailure( 4939 "Not inserting runtime ptr check for divergent target", 4940 "runtime pointer checks needed. Not enabled for divergent target", 4941 "CantVersionLoopWithDivergentTarget", ORE, TheLoop); 4942 return None; 4943 } 4944 4945 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); 4946 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); 4947 if (TC == 1) { 4948 reportVectorizationFailure("Single iteration (non) loop", 4949 "loop trip count is one, irrelevant for vectorization", 4950 "SingleIterationLoop", ORE, TheLoop); 4951 return None; 4952 } 4953 4954 switch (ScalarEpilogueStatus) { 4955 case CM_ScalarEpilogueAllowed: 4956 return computeFeasibleMaxVF(TC); 4957 case CM_ScalarEpilogueNotNeededUsePredicate: 4958 LLVM_DEBUG( 4959 dbgs() << "LV: vector predicate hint/switch found.\n" 4960 << "LV: Not allowing scalar epilogue, creating predicated " 4961 << "vector loop.\n"); 4962 break; 4963 case CM_ScalarEpilogueNotAllowedLowTripLoop: 4964 // fallthrough as a special case of OptForSize 4965 case CM_ScalarEpilogueNotAllowedOptSize: 4966 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) 4967 LLVM_DEBUG( 4968 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); 4969 else 4970 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " 4971 << "count.\n"); 4972 4973 // Bail if runtime checks are required, which are not good when optimising 4974 // for size. 4975 if (runtimeChecksRequired()) 4976 return None; 4977 break; 4978 } 4979 4980 // Now try the tail folding 4981 4982 // Invalidate interleave groups that require an epilogue if we can't mask 4983 // the interleave-group. 4984 if (!useMaskedInterleavedAccesses(TTI)) 4985 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); 4986 4987 unsigned MaxVF = computeFeasibleMaxVF(TC); 4988 if (TC > 0 && TC % MaxVF == 0) { 4989 // Accept MaxVF if we do not have a tail. 
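// For instance (illustrative numbers): TC = 64 with MaxVF = 8 leaves no
// scalar tail, so MaxVF is returned as is; a trip count such as 66 would
// leave a 2-iteration tail, skip this early return, and reach the
// tail-folding attempt further down.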
4990 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); 4991 return MaxVF; 4992 } 4993 4994 // If we don't know the precise trip count, or if the trip count that we 4995 // found modulo the vectorization factor is not zero, try to fold the tail 4996 // by masking. 4997 // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4998 if (Legal->prepareToFoldTailByMasking()) { 4999 FoldTailByMasking = true; 5000 return MaxVF; 5001 } 5002 5003 if (TC == 0) { 5004 reportVectorizationFailure( 5005 "Unable to calculate the loop count due to complex control flow", 5006 "unable to calculate the loop count due to complex control flow", 5007 "UnknownLoopCountComplexCFG", ORE, TheLoop); 5008 return None; 5009 } 5010 5011 reportVectorizationFailure( 5012 "Cannot optimize for size and vectorize at the same time.", 5013 "cannot optimize for size and vectorize at the same time. " 5014 "Enable vectorization of this loop with '#pragma clang loop " 5015 "vectorize(enable)' when compiling with -Os/-Oz", 5016 "NoTailLoopWithOptForSize", ORE, TheLoop); 5017 return None; 5018 } 5019 5020 unsigned 5021 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { 5022 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); 5023 unsigned SmallestType, WidestType; 5024 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); 5025 unsigned WidestRegister = TTI.getRegisterBitWidth(true); 5026 5027 // Get the maximum safe dependence distance in bits computed by LAA. 5028 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 5029 // the memory accesses that is most restrictive (involved in the smallest 5030 // dependence distance). 5031 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth(); 5032 5033 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth); 5034 5035 unsigned MaxVectorSize = WidestRegister / WidestType; 5036 5037 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType 5038 << " / " << WidestType << " bits.\n"); 5039 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " 5040 << WidestRegister << " bits.\n"); 5041 5042 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements" 5043 " into one vector!"); 5044 if (MaxVectorSize == 0) { 5045 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n"); 5046 MaxVectorSize = 1; 5047 return MaxVectorSize; 5048 } else if (ConstTripCount && ConstTripCount < MaxVectorSize && 5049 isPowerOf2_32(ConstTripCount)) { 5050 // We need to clamp the VF to be the ConstTripCount. There is no point in 5051 // choosing a higher viable VF as done in the loop below. 5052 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " 5053 << ConstTripCount << "\n"); 5054 MaxVectorSize = ConstTripCount; 5055 return MaxVectorSize; 5056 } 5057 5058 unsigned MaxVF = MaxVectorSize; 5059 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || 5060 (MaximizeBandwidth && isScalarEpilogueAllowed())) { 5061 // Collect all viable vectorization factors larger than the default MaxVF 5062 // (i.e. MaxVectorSize). 5063 SmallVector<unsigned, 8> VFs; 5064 unsigned NewMaxVectorSize = WidestRegister / SmallestType; 5065 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2) 5066 VFs.push_back(VS); 5067 5068 // For each VF calculate its register usage. 5069 auto RUs = calculateRegisterUsage(VFs); 5070 5071 // Select the largest VF which doesn't require more registers than existing 5072 // ones. 
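// Rough example (illustrative numbers): with a 128-bit register, WidestType
// of i32 and SmallestType of i8, the default MaxVectorSize is 4 and the
// candidate VFs collected above are 8 and 16; the loop below walks them from
// the largest down and picks the first whose estimated usage fits the
// target's register file.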
5073 for (int i = RUs.size() - 1; i >= 0; --i) { 5074 bool Selected = true; 5075 for (auto& pair : RUs[i].MaxLocalUsers) { 5076 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5077 if (pair.second > TargetNumRegisters) 5078 Selected = false; 5079 } 5080 if (Selected) { 5081 MaxVF = VFs[i]; 5082 break; 5083 } 5084 } 5085 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) { 5086 if (MaxVF < MinVF) { 5087 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF 5088 << ") with target's minimum: " << MinVF << '\n'); 5089 MaxVF = MinVF; 5090 } 5091 } 5092 } 5093 return MaxVF; 5094 } 5095 5096 VectorizationFactor 5097 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { 5098 float Cost = expectedCost(1).first; 5099 const float ScalarCost = Cost; 5100 unsigned Width = 1; 5101 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); 5102 5103 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; 5104 if (ForceVectorization && MaxVF > 1) { 5105 // Ignore scalar width, because the user explicitly wants vectorization. 5106 // Initialize cost to max so that VF = 2 is, at least, chosen during cost 5107 // evaluation. 5108 Cost = std::numeric_limits<float>::max(); 5109 } 5110 5111 for (unsigned i = 2; i <= MaxVF; i *= 2) { 5112 // Notice that the vector loop needs to be executed less times, so 5113 // we need to divide the cost of the vector loops by the width of 5114 // the vector elements. 5115 VectorizationCostTy C = expectedCost(i); 5116 float VectorCost = C.first / (float)i; 5117 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i 5118 << " costs: " << (int)VectorCost << ".\n"); 5119 if (!C.second && !ForceVectorization) { 5120 LLVM_DEBUG( 5121 dbgs() << "LV: Not considering vector loop of width " << i 5122 << " because it will not generate any vector instructions.\n"); 5123 continue; 5124 } 5125 if (VectorCost < Cost) { 5126 Cost = VectorCost; 5127 Width = i; 5128 } 5129 } 5130 5131 if (!EnableCondStoresVectorization && NumPredStores) { 5132 reportVectorizationFailure("There are conditional stores.", 5133 "store that is conditionally executed prevents vectorization", 5134 "ConditionalStore", ORE, TheLoop); 5135 Width = 1; 5136 Cost = ScalarCost; 5137 } 5138 5139 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() 5140 << "LV: Vectorization seems to be not beneficial, " 5141 << "but was forced by a user.\n"); 5142 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n"); 5143 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)}; 5144 return Factor; 5145 } 5146 5147 std::pair<unsigned, unsigned> 5148 LoopVectorizationCostModel::getSmallestAndWidestTypes() { 5149 unsigned MinWidth = -1U; 5150 unsigned MaxWidth = 8; 5151 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5152 5153 // For each block. 5154 for (BasicBlock *BB : TheLoop->blocks()) { 5155 // For each instruction in the loop. 5156 for (Instruction &I : BB->instructionsWithoutDebug()) { 5157 Type *T = I.getType(); 5158 5159 // Skip ignored values. 5160 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end()) 5161 continue; 5162 5163 // Only examine Loads, Stores and PHINodes. 5164 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I)) 5165 continue; 5166 5167 // Examine PHI nodes that are reduction variables. Update the type to 5168 // account for the recurrence type. 
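// For example (illustrative): a reduction phi declared as i32 whose
// recurrence is known to operate on a narrower type reports that narrower
// recurrence type here, so the width bounds reflect the values actually
// flowing through the loop rather than the phi's declared type.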
5169 if (auto *PN = dyn_cast<PHINode>(&I)) { 5170 if (!Legal->isReductionVariable(PN)) 5171 continue; 5172 RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; 5173 T = RdxDesc.getRecurrenceType(); 5174 } 5175 5176 // Examine the stored values. 5177 if (auto *ST = dyn_cast<StoreInst>(&I)) 5178 T = ST->getValueOperand()->getType(); 5179 5180 // Ignore loaded pointer types and stored pointer types that are not 5181 // vectorizable. 5182 // 5183 // FIXME: The check here attempts to predict whether a load or store will 5184 // be vectorized. We only know this for certain after a VF has 5185 // been selected. Here, we assume that if an access can be 5186 // vectorized, it will be. We should also look at extending this 5187 // optimization to non-pointer types. 5188 // 5189 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) && 5190 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I)) 5191 continue; 5192 5193 MinWidth = std::min(MinWidth, 5194 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5195 MaxWidth = std::max(MaxWidth, 5196 (unsigned)DL.getTypeSizeInBits(T->getScalarType())); 5197 } 5198 } 5199 5200 return {MinWidth, MaxWidth}; 5201 } 5202 5203 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, 5204 unsigned LoopCost) { 5205 // -- The interleave heuristics -- 5206 // We interleave the loop in order to expose ILP and reduce the loop overhead. 5207 // There are many micro-architectural considerations that we can't predict 5208 // at this level. For example, frontend pressure (on decode or fetch) due to 5209 // code size, or the number and capabilities of the execution ports. 5210 // 5211 // We use the following heuristics to select the interleave count: 5212 // 1. If the code has reductions, then we interleave to break the cross 5213 // iteration dependency. 5214 // 2. If the loop is really small, then we interleave to reduce the loop 5215 // overhead. 5216 // 3. We don't interleave if we think that we will spill registers to memory 5217 // due to the increased register pressure. 5218 5219 if (!isScalarEpilogueAllowed()) 5220 return 1; 5221 5222 // We used the distance for the interleave count. 5223 if (Legal->getMaxSafeDepDistBytes() != -1U) 5224 return 1; 5225 5226 // Do not interleave loops with a relatively small known or estimated trip 5227 // count. 5228 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); 5229 if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) 5230 return 1; 5231 5232 RegisterUsage R = calculateRegisterUsage({VF})[0]; 5233 // We divide by these constants so assume that we have at least one 5234 // instruction that uses at least one register. 5235 for (auto& pair : R.MaxLocalUsers) { 5236 pair.second = std::max(pair.second, 1U); 5237 } 5238 5239 // We calculate the interleave count using the following formula. 5240 // Subtract the number of loop invariants from the number of available 5241 // registers. These registers are used by all of the interleaved instances. 5242 // Next, divide the remaining registers by the number of registers that is 5243 // required by the loop, in order to estimate how many parallel instances 5244 // fit without causing spills. All of this is rounded down if necessary to be 5245 // a power of two. We want power of two interleave count to simplify any 5246 // addressing operations or alignment considerations. 
5247 // We also want power of two interleave counts to ensure that the induction 5248 // variable of the vector loop wraps to zero, when tail is folded by masking; 5249 // this currently happens when OptForSize, in which case IC is set to 1 above. 5250 unsigned IC = UINT_MAX; 5251 5252 for (auto& pair : R.MaxLocalUsers) { 5253 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); 5254 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters 5255 << " registers of " 5256 << TTI.getRegisterClassName(pair.first) << " register class\n"); 5257 if (VF == 1) { 5258 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) 5259 TargetNumRegisters = ForceTargetNumScalarRegs; 5260 } else { 5261 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) 5262 TargetNumRegisters = ForceTargetNumVectorRegs; 5263 } 5264 unsigned MaxLocalUsers = pair.second; 5265 unsigned LoopInvariantRegs = 0; 5266 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) 5267 LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; 5268 5269 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); 5270 // Don't count the induction variable as interleaved. 5271 if (EnableIndVarRegisterHeur) { 5272 TmpIC = 5273 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / 5274 std::max(1U, (MaxLocalUsers - 1))); 5275 } 5276 5277 IC = std::min(IC, TmpIC); 5278 } 5279 5280 // Clamp the interleave ranges to reasonable counts. 5281 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); 5282 5283 // Check if the user has overridden the max. 5284 if (VF == 1) { 5285 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) 5286 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; 5287 } else { 5288 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) 5289 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; 5290 } 5291 5292 // If trip count is known or estimated compile time constant, limit the 5293 // interleave count to be less than the trip count divided by VF. 5294 if (BestKnownTC) { 5295 MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); 5296 } 5297 5298 // If we did not calculate the cost for VF (because the user selected the VF) 5299 // then we calculate the cost of VF here. 5300 if (LoopCost == 0) 5301 LoopCost = expectedCost(VF).first; 5302 5303 assert(LoopCost && "Non-zero loop cost expected"); 5304 5305 // Clamp the calculated IC to be between the 1 and the max interleave count 5306 // that the target and trip count allows. 5307 if (IC > MaxInterleaveCount) 5308 IC = MaxInterleaveCount; 5309 else if (IC < 1) 5310 IC = 1; 5311 5312 // Interleave if we vectorized this loop and there is a reduction that could 5313 // benefit from interleaving. 5314 if (VF > 1 && !Legal->getReductionVars().empty()) { 5315 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); 5316 return IC; 5317 } 5318 5319 // Note that if we've already vectorized the loop we will have done the 5320 // runtime check and so interleaving won't require further checks. 5321 bool InterleavingRequiresRuntimePointerCheck = 5322 (VF == 1 && Legal->getRuntimePointerChecking()->Need); 5323 5324 // We want to interleave small loops in order to reduce the loop overhead and 5325 // potentially expose ILP opportunities. 
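// Worked example (illustrative numbers, assuming SmallLoopCost = 20): an
// estimated LoopCost of 4 makes the body cheap relative to its overhead, so
// SmallIC below becomes min(IC, PowerOf2Floor(20 / 4)) = min(IC, 4), before
// the store/load-port saturation check and the nested-reduction clamp are
// applied.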
5326 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); 5327 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { 5328 // We assume that the cost overhead is 1 and we use the cost model 5329 // to estimate the cost of the loop and interleave until the cost of the 5330 // loop overhead is about 5% of the cost of the loop. 5331 unsigned SmallIC = 5332 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); 5333 5334 // Interleave until store/load ports (estimated by max interleave count) are 5335 // saturated. 5336 unsigned NumStores = Legal->getNumStores(); 5337 unsigned NumLoads = Legal->getNumLoads(); 5338 unsigned StoresIC = IC / (NumStores ? NumStores : 1); 5339 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); 5340 5341 // If we have a scalar reduction (vector reductions are already dealt with 5342 // by this point), we can increase the critical path length if the loop 5343 // we're interleaving is inside another loop. Limit, by default to 2, so the 5344 // critical path only gets increased by one reduction operation. 5345 if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) { 5346 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); 5347 SmallIC = std::min(SmallIC, F); 5348 StoresIC = std::min(StoresIC, F); 5349 LoadsIC = std::min(LoadsIC, F); 5350 } 5351 5352 if (EnableLoadStoreRuntimeInterleave && 5353 std::max(StoresIC, LoadsIC) > SmallIC) { 5354 LLVM_DEBUG( 5355 dbgs() << "LV: Interleaving to saturate store or load ports.\n"); 5356 return std::max(StoresIC, LoadsIC); 5357 } 5358 5359 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); 5360 return SmallIC; 5361 } 5362 5363 // Interleave if this is a large loop (small loops are already dealt with by 5364 // this point) that could benefit from interleaving. 5365 bool HasReductions = !Legal->getReductionVars().empty(); 5366 if (TTI.enableAggressiveInterleaving(HasReductions)) { 5367 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); 5368 return IC; 5369 } 5370 5371 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n"); 5372 return 1; 5373 } 5374 5375 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> 5376 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { 5377 // This function calculates the register usage by measuring the highest number 5378 // of values that are alive at a single location. Obviously, this is a very 5379 // rough estimation. We scan the loop in a topological order in order and 5380 // assign a number to each instruction. We use RPO to ensure that defs are 5381 // met before their users. We assume that each instruction that has in-loop 5382 // users starts an interval. We record every time that an in-loop value is 5383 // used, so we have a list of the first and last occurrences of each 5384 // instruction. Next, we transpose this data structure into a multi map that 5385 // holds the list of intervals that *end* at a specific location. This multi 5386 // map allows us to perform a linear search. We scan the instructions linearly 5387 // and record each time that a new interval starts, by placing it in a set. 5388 // If we find this value in the multi-map then we remove it from the set. 5389 // The max register usage is the maximum size of the set. 5390 // We also search for instructions that are defined outside the loop, but are 5391 // used inside the loop. 
We need this number separately from the max-interval 5392 // usage number because when we unroll, loop-invariant values do not take 5393 // more register. 5394 LoopBlocksDFS DFS(TheLoop); 5395 DFS.perform(LI); 5396 5397 RegisterUsage RU; 5398 5399 // Each 'key' in the map opens a new interval. The values 5400 // of the map are the index of the 'last seen' usage of the 5401 // instruction that is the key. 5402 using IntervalMap = DenseMap<Instruction *, unsigned>; 5403 5404 // Maps instruction to its index. 5405 SmallVector<Instruction *, 64> IdxToInstr; 5406 // Marks the end of each interval. 5407 IntervalMap EndPoint; 5408 // Saves the list of instruction indices that are used in the loop. 5409 SmallPtrSet<Instruction *, 8> Ends; 5410 // Saves the list of values that are used in the loop but are 5411 // defined outside the loop, such as arguments and constants. 5412 SmallPtrSet<Value *, 8> LoopInvariants; 5413 5414 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 5415 for (Instruction &I : BB->instructionsWithoutDebug()) { 5416 IdxToInstr.push_back(&I); 5417 5418 // Save the end location of each USE. 5419 for (Value *U : I.operands()) { 5420 auto *Instr = dyn_cast<Instruction>(U); 5421 5422 // Ignore non-instruction values such as arguments, constants, etc. 5423 if (!Instr) 5424 continue; 5425 5426 // If this instruction is outside the loop then record it and continue. 5427 if (!TheLoop->contains(Instr)) { 5428 LoopInvariants.insert(Instr); 5429 continue; 5430 } 5431 5432 // Overwrite previous end points. 5433 EndPoint[Instr] = IdxToInstr.size(); 5434 Ends.insert(Instr); 5435 } 5436 } 5437 } 5438 5439 // Saves the list of intervals that end with the index in 'key'. 5440 using InstrList = SmallVector<Instruction *, 2>; 5441 DenseMap<unsigned, InstrList> TransposeEnds; 5442 5443 // Transpose the EndPoints to a list of values that end at each index. 5444 for (auto &Interval : EndPoint) 5445 TransposeEnds[Interval.second].push_back(Interval.first); 5446 5447 SmallPtrSet<Instruction *, 8> OpenIntervals; 5448 5449 // Get the size of the widest register. 5450 unsigned MaxSafeDepDist = -1U; 5451 if (Legal->getMaxSafeDepDistBytes() != -1U) 5452 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; 5453 unsigned WidestRegister = 5454 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); 5455 const DataLayout &DL = TheFunction->getParent()->getDataLayout(); 5456 5457 SmallVector<RegisterUsage, 8> RUs(VFs.size()); 5458 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); 5459 5460 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); 5461 5462 // A lambda that gets the register usage for the given type and VF. 5463 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { 5464 if (Ty->isTokenTy()) 5465 return 0U; 5466 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); 5467 return std::max<unsigned>(1, VF * TypeSize / WidestRegister); 5468 }; 5469 5470 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { 5471 Instruction *I = IdxToInstr[i]; 5472 5473 // Remove all of the instructions that end at this location. 5474 InstrList &List = TransposeEnds[i]; 5475 for (Instruction *ToRemove : List) 5476 OpenIntervals.erase(ToRemove); 5477 5478 // Ignore instructions that are never used within the loop. 5479 if (Ends.find(I) == Ends.end()) 5480 continue; 5481 5482 // Skip ignored values. 5483 if (ValuesToIgnore.find(I) != ValuesToIgnore.end()) 5484 continue; 5485 5486 // For each VF find the maximum usage of registers. 
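// For example (illustrative): with 128-bit registers, an open interval of
// type i32 contributes GetRegUsage(i32, VF) = max(1, VF * 32 / 128) wide
// registers, i.e. one register at VF = 4 and two at VF = 8; at VF == 1 every
// open interval simply counts as a single register of its class.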
5487 for (unsigned j = 0, e = VFs.size(); j < e; ++j) { 5488 // Count the number of live intervals. 5489 SmallMapVector<unsigned, unsigned, 4> RegUsage; 5490 5491 if (VFs[j] == 1) { 5492 for (auto Inst : OpenIntervals) { 5493 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5494 if (RegUsage.find(ClassID) == RegUsage.end()) 5495 RegUsage[ClassID] = 1; 5496 else 5497 RegUsage[ClassID] += 1; 5498 } 5499 } else { 5500 collectUniformsAndScalars(VFs[j]); 5501 for (auto Inst : OpenIntervals) { 5502 // Skip ignored values for VF > 1. 5503 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) 5504 continue; 5505 if (isScalarAfterVectorization(Inst, VFs[j])) { 5506 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); 5507 if (RegUsage.find(ClassID) == RegUsage.end()) 5508 RegUsage[ClassID] = 1; 5509 else 5510 RegUsage[ClassID] += 1; 5511 } else { 5512 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); 5513 if (RegUsage.find(ClassID) == RegUsage.end()) 5514 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); 5515 else 5516 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); 5517 } 5518 } 5519 } 5520 5521 for (auto& pair : RegUsage) { 5522 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) 5523 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); 5524 else 5525 MaxUsages[j][pair.first] = pair.second; 5526 } 5527 } 5528 5529 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " 5530 << OpenIntervals.size() << '\n'); 5531 5532 // Add the current instruction to the list of open intervals. 5533 OpenIntervals.insert(I); 5534 } 5535 5536 for (unsigned i = 0, e = VFs.size(); i < e; ++i) { 5537 SmallMapVector<unsigned, unsigned, 4> Invariant; 5538 5539 for (auto Inst : LoopInvariants) { 5540 unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); 5541 unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); 5542 if (Invariant.find(ClassID) == Invariant.end()) 5543 Invariant[ClassID] = Usage; 5544 else 5545 Invariant[ClassID] += Usage; 5546 } 5547 5548 LLVM_DEBUG({ 5549 dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; 5550 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() 5551 << " item\n"; 5552 for (const auto &pair : MaxUsages[i]) { 5553 dbgs() << "LV(REG): RegisterClass: " 5554 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5555 << " registers\n"; 5556 } 5557 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() 5558 << " item\n"; 5559 for (const auto &pair : Invariant) { 5560 dbgs() << "LV(REG): RegisterClass: " 5561 << TTI.getRegisterClassName(pair.first) << ", " << pair.second 5562 << " registers\n"; 5563 } 5564 }); 5565 5566 RU.LoopInvariantRegs = Invariant; 5567 RU.MaxLocalUsers = MaxUsages[i]; 5568 RUs[i] = RU; 5569 } 5570 5571 return RUs; 5572 } 5573 5574 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){ 5575 // TODO: Cost model for emulated masked load/store is completely 5576 // broken. This hack guides the cost model to use an artificially 5577 // high enough value to practically disable vectorization with such 5578 // operations, except where previously deployed legality hack allowed 5579 // using very low cost values. This is to avoid regressions coming simply 5580 // from moving "masked load/store" check from legality to cost model. 5581 // Masked Load/Gather emulation was previously never allowed. 5582 // Limited number of Masked Store/Scatter emulation was allowed. 
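  // In effect: an emulated (scalarized) masked load always receives the
  // artificially high cost, while an emulated masked store receives it only
  // once the number of predicated stores in the loop exceeds
  // NumberOfStoresToPredicate. The high cost itself (3000000) is applied in
  // getMemInstScalarizationCost() when this predicate returns true.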
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop
  // and determine whether it would be better not to if-convert the blocks
  // they are in. If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
5656 // 5657 // We assume we will only emit a value for lane zero of an instruction 5658 // marked uniform after vectorization, rather than VF identical values. 5659 // Thus, if we scalarize an instruction that uses a uniform, we would 5660 // create uses of values corresponding to the lanes we aren't emitting code 5661 // for. This behavior can be changed by allowing getScalarValue to clone 5662 // the lane zero values for uniforms rather than asserting. 5663 for (Use &U : I->operands()) 5664 if (auto *J = dyn_cast<Instruction>(U.get())) 5665 if (isUniformAfterVectorization(J, VF)) 5666 return false; 5667 5668 // Otherwise, we can scalarize the instruction. 5669 return true; 5670 }; 5671 5672 // Compute the expected cost discount from scalarizing the entire expression 5673 // feeding the predicated instruction. We currently only consider expressions 5674 // that are single-use instruction chains. 5675 Worklist.push_back(PredInst); 5676 while (!Worklist.empty()) { 5677 Instruction *I = Worklist.pop_back_val(); 5678 5679 // If we've already analyzed the instruction, there's nothing to do. 5680 if (ScalarCosts.find(I) != ScalarCosts.end()) 5681 continue; 5682 5683 // Compute the cost of the vector instruction. Note that this cost already 5684 // includes the scalarization overhead of the predicated instruction. 5685 unsigned VectorCost = getInstructionCost(I, VF).first; 5686 5687 // Compute the cost of the scalarized instruction. This cost is the cost of 5688 // the instruction as if it wasn't if-converted and instead remained in the 5689 // predicated block. We will scale this cost by block probability after 5690 // computing the scalarization overhead. 5691 unsigned ScalarCost = VF * getInstructionCost(I, 1).first; 5692 5693 // Compute the scalarization overhead of needed insertelement instructions 5694 // and phi nodes. 5695 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { 5696 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), 5697 true, false); 5698 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); 5699 } 5700 5701 // Compute the scalarization overhead of needed extractelement 5702 // instructions. For each of the instruction's operands, if the operand can 5703 // be scalarized, add it to the worklist; otherwise, account for the 5704 // overhead. 5705 for (Use &U : I->operands()) 5706 if (auto *J = dyn_cast<Instruction>(U.get())) { 5707 assert(VectorType::isValidElementType(J->getType()) && 5708 "Instruction has non-scalar type"); 5709 if (canBeScalarized(J)) 5710 Worklist.push_back(J); 5711 else if (needsExtract(J, VF)) 5712 ScalarCost += TTI.getScalarizationOverhead( 5713 ToVectorTy(J->getType(),VF), false, true); 5714 } 5715 5716 // Scale the total scalar cost by block probability. 5717 ScalarCost /= getReciprocalPredBlockProb(); 5718 5719 // Compute the discount. A non-negative discount means the vector version 5720 // of the instruction costs more, and scalarizing would be beneficial. 5721 Discount += VectorCost - ScalarCost; 5722 ScalarCosts[I] = ScalarCost; 5723 } 5724 5725 return Discount; 5726 } 5727 5728 LoopVectorizationCostModel::VectorizationCostTy 5729 LoopVectorizationCostModel::expectedCost(unsigned VF) { 5730 VectorizationCostTy Cost; 5731 5732 // For each block. 5733 for (BasicBlock *BB : TheLoop->blocks()) { 5734 VectorizationCostTy BlockCost; 5735 5736 // For each instruction in the old loop. 5737 for (Instruction &I : BB->instructionsWithoutDebug()) { 5738 // Skip ignored values. 
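      // (ValuesToIgnore holds the ephemeral values collected in
      // collectValuesToIgnore(); VecValuesToIgnore holds the type-casting
      // instructions identified during reduction and induction detection,
      // which are expected to disappear in the vectorized loop and are
      // therefore only skipped for VF > 1.)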
5739 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() || 5740 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end())) 5741 continue; 5742 5743 VectorizationCostTy C = getInstructionCost(&I, VF); 5744 5745 // Check if we should override the cost. 5746 if (ForceTargetInstructionCost.getNumOccurrences() > 0) 5747 C.first = ForceTargetInstructionCost; 5748 5749 BlockCost.first += C.first; 5750 BlockCost.second |= C.second; 5751 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first 5752 << " for VF " << VF << " For instruction: " << I 5753 << '\n'); 5754 } 5755 5756 // If we are vectorizing a predicated block, it will have been 5757 // if-converted. This means that the block's instructions (aside from 5758 // stores and instructions that may divide by zero) will now be 5759 // unconditionally executed. For the scalar case, we may not always execute 5760 // the predicated block. Thus, scale the block's cost by the probability of 5761 // executing it. 5762 if (VF == 1 && blockNeedsPredication(BB)) 5763 BlockCost.first /= getReciprocalPredBlockProb(); 5764 5765 Cost.first += BlockCost.first; 5766 Cost.second |= BlockCost.second; 5767 } 5768 5769 return Cost; 5770 } 5771 5772 /// Gets Address Access SCEV after verifying that the access pattern 5773 /// is loop invariant except the induction variable dependence. 5774 /// 5775 /// This SCEV can be sent to the Target in order to estimate the address 5776 /// calculation cost. 5777 static const SCEV *getAddressAccessSCEV( 5778 Value *Ptr, 5779 LoopVectorizationLegality *Legal, 5780 PredicatedScalarEvolution &PSE, 5781 const Loop *TheLoop) { 5782 5783 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr); 5784 if (!Gep) 5785 return nullptr; 5786 5787 // We are looking for a gep with all loop invariant indices except for one 5788 // which should be an induction variable. 5789 auto SE = PSE.getSE(); 5790 unsigned NumOperands = Gep->getNumOperands(); 5791 for (unsigned i = 1; i < NumOperands; ++i) { 5792 Value *Opd = Gep->getOperand(i); 5793 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && 5794 !Legal->isInductionVariable(Opd)) 5795 return nullptr; 5796 } 5797 5798 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. 5799 return PSE.getSCEV(Ptr); 5800 } 5801 5802 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { 5803 return Legal->hasStride(I->getOperand(0)) || 5804 Legal->hasStride(I->getOperand(1)); 5805 } 5806 5807 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, 5808 unsigned VF) { 5809 assert(VF > 1 && "Scalarization cost of instruction implies vectorization."); 5810 Type *ValTy = getMemInstValueType(I); 5811 auto SE = PSE.getSE(); 5812 5813 unsigned AS = getLoadStoreAddressSpace(I); 5814 Value *Ptr = getLoadStorePointerOperand(I); 5815 Type *PtrTy = ToVectorTy(Ptr->getType(), VF); 5816 5817 // Figure out whether the access is strided and get the stride value 5818 // if it's known in compile time 5819 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); 5820 5821 // Get the cost of the scalar memory instruction and address computation. 5822 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); 5823 5824 // Don't pass *I here, since it is scalar but will actually be part of a 5825 // vectorized loop where the user of it is a vectorized instruction. 
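  // Rough shape of the total computed in this function, with made-up numbers
  // and ignoring the useEmulatedMaskMemRefHack() override: for a predicated
  // store at VF = 4, an address-computation cost of 1 and a scalar store cost
  // of 1, the base cost is 4 * (1 + 1) = 8, plus the insert/extract overhead
  // from getScalarizationOverhead(); the sum is then divided by
  // getReciprocalPredBlockProb(), since each scalarized lane only executes
  // when its mask bit is set.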
5826 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5827 Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), 5828 Alignment, AS); 5829 5830 // Get the overhead of the extractelement and insertelement instructions 5831 // we might create due to scalarization. 5832 Cost += getScalarizationOverhead(I, VF); 5833 5834 // If we have a predicated store, it may not be executed for each vector 5835 // lane. Scale the cost by the probability of executing the predicated 5836 // block. 5837 if (isPredicatedInst(I)) { 5838 Cost /= getReciprocalPredBlockProb(); 5839 5840 if (useEmulatedMaskMemRefHack(I)) 5841 // Artificially setting to a high enough value to practically disable 5842 // vectorization with such operations. 5843 Cost = 3000000; 5844 } 5845 5846 return Cost; 5847 } 5848 5849 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, 5850 unsigned VF) { 5851 Type *ValTy = getMemInstValueType(I); 5852 Type *VectorTy = ToVectorTy(ValTy, VF); 5853 Value *Ptr = getLoadStorePointerOperand(I); 5854 unsigned AS = getLoadStoreAddressSpace(I); 5855 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); 5856 5857 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 5858 "Stride should be 1 or -1 for consecutive memory access"); 5859 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5860 unsigned Cost = 0; 5861 if (Legal->isMaskRequired(I)) 5862 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, 5863 Alignment ? Alignment->value() : 0, AS); 5864 else 5865 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); 5866 5867 bool Reverse = ConsecutiveStride < 0; 5868 if (Reverse) 5869 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5870 return Cost; 5871 } 5872 5873 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, 5874 unsigned VF) { 5875 Type *ValTy = getMemInstValueType(I); 5876 Type *VectorTy = ToVectorTy(ValTy, VF); 5877 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5878 unsigned AS = getLoadStoreAddressSpace(I); 5879 if (isa<LoadInst>(I)) { 5880 return TTI.getAddressComputationCost(ValTy) + 5881 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + 5882 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); 5883 } 5884 StoreInst *SI = cast<StoreInst>(I); 5885 5886 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); 5887 return TTI.getAddressComputationCost(ValTy) + 5888 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + 5889 (isLoopInvariantStoreValue 5890 ? 0 5891 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, 5892 VF - 1)); 5893 } 5894 5895 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, 5896 unsigned VF) { 5897 Type *ValTy = getMemInstValueType(I); 5898 Type *VectorTy = ToVectorTy(ValTy, VF); 5899 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5900 Value *Ptr = getLoadStorePointerOperand(I); 5901 5902 return TTI.getAddressComputationCost(VectorTy) + 5903 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, 5904 Legal->isMaskRequired(I), 5905 Alignment ? 
Alignment->value() : 0, I); 5906 } 5907 5908 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, 5909 unsigned VF) { 5910 Type *ValTy = getMemInstValueType(I); 5911 Type *VectorTy = ToVectorTy(ValTy, VF); 5912 unsigned AS = getLoadStoreAddressSpace(I); 5913 5914 auto Group = getInterleavedAccessGroup(I); 5915 assert(Group && "Fail to get an interleaved access group."); 5916 5917 unsigned InterleaveFactor = Group->getFactor(); 5918 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); 5919 5920 // Holds the indices of existing members in an interleaved load group. 5921 // An interleaved store group doesn't need this as it doesn't allow gaps. 5922 SmallVector<unsigned, 4> Indices; 5923 if (isa<LoadInst>(I)) { 5924 for (unsigned i = 0; i < InterleaveFactor; i++) 5925 if (Group->getMember(i)) 5926 Indices.push_back(i); 5927 } 5928 5929 // Calculate the cost of the whole interleaved group. 5930 bool UseMaskForGaps = 5931 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); 5932 unsigned Cost = TTI.getInterleavedMemoryOpCost( 5933 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, 5934 Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps); 5935 5936 if (Group->isReverse()) { 5937 // TODO: Add support for reversed masked interleaved access. 5938 assert(!Legal->isMaskRequired(I) && 5939 "Reverse masked interleaved access not supported."); 5940 Cost += Group->getNumMembers() * 5941 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); 5942 } 5943 return Cost; 5944 } 5945 5946 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, 5947 unsigned VF) { 5948 // Calculate scalar cost only. Vectorization cost should be ready at this 5949 // moment. 5950 if (VF == 1) { 5951 Type *ValTy = getMemInstValueType(I); 5952 const MaybeAlign Alignment = getLoadStoreAlignment(I); 5953 unsigned AS = getLoadStoreAddressSpace(I); 5954 5955 return TTI.getAddressComputationCost(ValTy) + 5956 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); 5957 } 5958 return getWideningCost(I, VF); 5959 } 5960 5961 LoopVectorizationCostModel::VectorizationCostTy 5962 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { 5963 // If we know that this instruction will remain uniform, check the cost of 5964 // the scalar version. 5965 if (isUniformAfterVectorization(I, VF)) 5966 VF = 1; 5967 5968 if (VF > 1 && isProfitableToScalarize(I, VF)) 5969 return VectorizationCostTy(InstsToScalarize[VF][I], false); 5970 5971 // Forced scalars do not have any scalarization overhead. 
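  // (Instructions typically land in ForcedScalars via
  // setCostBasedWideningDecision() below, e.g. address computations feeding
  // memory instructions on targets that do not prefer vectorized addressing.
  // Their cost is modelled as VF copies of the scalar instruction, see the
  // multiplication below, with no extract/insert overhead added.)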
5972 auto ForcedScalar = ForcedScalars.find(VF); 5973 if (VF > 1 && ForcedScalar != ForcedScalars.end()) { 5974 auto InstSet = ForcedScalar->second; 5975 if (InstSet.find(I) != InstSet.end()) 5976 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); 5977 } 5978 5979 Type *VectorTy; 5980 unsigned C = getInstructionCost(I, VF, VectorTy); 5981 5982 bool TypeNotScalarized = 5983 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; 5984 return VectorizationCostTy(C, TypeNotScalarized); 5985 } 5986 5987 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, 5988 unsigned VF) { 5989 5990 if (VF == 1) 5991 return 0; 5992 5993 unsigned Cost = 0; 5994 Type *RetTy = ToVectorTy(I->getType(), VF); 5995 if (!RetTy->isVoidTy() && 5996 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) 5997 Cost += TTI.getScalarizationOverhead(RetTy, true, false); 5998 5999 // Some targets keep addresses scalar. 6000 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing()) 6001 return Cost; 6002 6003 // Some targets support efficient element stores. 6004 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore()) 6005 return Cost; 6006 6007 // Collect operands to consider. 6008 CallInst *CI = dyn_cast<CallInst>(I); 6009 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands(); 6010 6011 // Skip operands that do not require extraction/scalarization and do not incur 6012 // any overhead. 6013 return Cost + TTI.getOperandsScalarizationOverhead( 6014 filterExtractingOperands(Ops, VF), VF); 6015 } 6016 6017 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { 6018 if (VF == 1) 6019 return; 6020 NumPredStores = 0; 6021 for (BasicBlock *BB : TheLoop->blocks()) { 6022 // For each instruction in the old loop. 6023 for (Instruction &I : *BB) { 6024 Value *Ptr = getLoadStorePointerOperand(&I); 6025 if (!Ptr) 6026 continue; 6027 6028 // TODO: We should generate better code and update the cost model for 6029 // predicated uniform stores. Today they are treated as any other 6030 // predicated store (see added test cases in 6031 // invariant-store-vectorization.ll). 6032 if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) 6033 NumPredStores++; 6034 6035 if (Legal->isUniform(Ptr) && 6036 // Conditional loads and stores should be scalarized and predicated. 6037 // isScalarWithPredication cannot be used here since masked 6038 // gather/scatters are not considered scalar with predication. 6039 !Legal->blockNeedsPredication(I.getParent())) { 6040 // TODO: Avoid replicating loads and stores instead of 6041 // relying on instcombine to remove them. 6042 // Load: Scalar load + broadcast 6043 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract 6044 unsigned Cost = getUniformMemOpCost(&I, VF); 6045 setWideningDecision(&I, VF, CM_Scalarize, Cost); 6046 continue; 6047 } 6048 6049 // We assume that widening is the best solution when possible. 6050 if (memoryInstructionCanBeWidened(&I, VF)) { 6051 unsigned Cost = getConsecutiveMemOpCost(&I, VF); 6052 int ConsecutiveStride = 6053 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I)); 6054 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && 6055 "Expected consecutive stride."); 6056 InstWidening Decision = 6057 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; 6058 setWideningDecision(&I, VF, Decision, Cost); 6059 continue; 6060 } 6061 6062 // Choose between Interleaving, Gather/Scatter or Scalarization. 
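      // Illustrative example (costs are made up): with InterleaveCost = 8,
      // GatherScatterCost = 12 and ScalarizationCost = 20, the group is
      // interleaved. Note the asymmetry below: interleaving wins a tie with
      // gather/scatter (<=) but must be strictly cheaper than scalarization.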
6063 unsigned InterleaveCost = std::numeric_limits<unsigned>::max(); 6064 unsigned NumAccesses = 1; 6065 if (isAccessInterleaved(&I)) { 6066 auto Group = getInterleavedAccessGroup(&I); 6067 assert(Group && "Fail to get an interleaved access group."); 6068 6069 // Make one decision for the whole group. 6070 if (getWideningDecision(&I, VF) != CM_Unknown) 6071 continue; 6072 6073 NumAccesses = Group->getNumMembers(); 6074 if (interleavedAccessCanBeWidened(&I, VF)) 6075 InterleaveCost = getInterleaveGroupCost(&I, VF); 6076 } 6077 6078 unsigned GatherScatterCost = 6079 isLegalGatherOrScatter(&I) 6080 ? getGatherScatterCost(&I, VF) * NumAccesses 6081 : std::numeric_limits<unsigned>::max(); 6082 6083 unsigned ScalarizationCost = 6084 getMemInstScalarizationCost(&I, VF) * NumAccesses; 6085 6086 // Choose better solution for the current VF, 6087 // write down this decision and use it during vectorization. 6088 unsigned Cost; 6089 InstWidening Decision; 6090 if (InterleaveCost <= GatherScatterCost && 6091 InterleaveCost < ScalarizationCost) { 6092 Decision = CM_Interleave; 6093 Cost = InterleaveCost; 6094 } else if (GatherScatterCost < ScalarizationCost) { 6095 Decision = CM_GatherScatter; 6096 Cost = GatherScatterCost; 6097 } else { 6098 Decision = CM_Scalarize; 6099 Cost = ScalarizationCost; 6100 } 6101 // If the instructions belongs to an interleave group, the whole group 6102 // receives the same decision. The whole group receives the cost, but 6103 // the cost will actually be assigned to one instruction. 6104 if (auto Group = getInterleavedAccessGroup(&I)) 6105 setWideningDecision(Group, VF, Decision, Cost); 6106 else 6107 setWideningDecision(&I, VF, Decision, Cost); 6108 } 6109 } 6110 6111 // Make sure that any load of address and any other address computation 6112 // remains scalar unless there is gather/scatter support. This avoids 6113 // inevitable extracts into address registers, and also has the benefit of 6114 // activating LSR more, since that pass can't optimize vectorized 6115 // addresses. 6116 if (TTI.prefersVectorizedAddressing()) 6117 return; 6118 6119 // Start with all scalar pointer uses. 6120 SmallPtrSet<Instruction *, 8> AddrDefs; 6121 for (BasicBlock *BB : TheLoop->blocks()) 6122 for (Instruction &I : *BB) { 6123 Instruction *PtrDef = 6124 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I)); 6125 if (PtrDef && TheLoop->contains(PtrDef) && 6126 getWideningDecision(&I, VF) != CM_GatherScatter) 6127 AddrDefs.insert(PtrDef); 6128 } 6129 6130 // Add all instructions used to generate the addresses. 6131 SmallVector<Instruction *, 4> Worklist; 6132 for (auto *I : AddrDefs) 6133 Worklist.push_back(I); 6134 while (!Worklist.empty()) { 6135 Instruction *I = Worklist.pop_back_val(); 6136 for (auto &Op : I->operands()) 6137 if (auto *InstOp = dyn_cast<Instruction>(Op)) 6138 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) && 6139 AddrDefs.insert(InstOp).second) 6140 Worklist.push_back(InstOp); 6141 } 6142 6143 for (auto *I : AddrDefs) { 6144 if (isa<LoadInst>(I)) { 6145 // Setting the desired widening decision should ideally be handled in 6146 // by cost functions, but since this involves the task of finding out 6147 // if the loaded register is involved in an address computation, it is 6148 // instead changed here when we know this is the case. 6149 InstWidening Decision = getWideningDecision(I, VF); 6150 if (Decision == CM_Widen || Decision == CM_Widen_Reverse) 6151 // Scalarize a widened load of address. 
6152 setWideningDecision(I, VF, CM_Scalarize, 6153 (VF * getMemoryInstructionCost(I, 1))); 6154 else if (auto Group = getInterleavedAccessGroup(I)) { 6155 // Scalarize an interleave group of address loads. 6156 for (unsigned I = 0; I < Group->getFactor(); ++I) { 6157 if (Instruction *Member = Group->getMember(I)) 6158 setWideningDecision(Member, VF, CM_Scalarize, 6159 (VF * getMemoryInstructionCost(Member, 1))); 6160 } 6161 } 6162 } else 6163 // Make sure I gets scalarized and a cost estimate without 6164 // scalarization overhead. 6165 ForcedScalars[VF].insert(I); 6166 } 6167 } 6168 6169 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, 6170 unsigned VF, 6171 Type *&VectorTy) { 6172 Type *RetTy = I->getType(); 6173 if (canTruncateToMinimalBitwidth(I, VF)) 6174 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); 6175 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); 6176 auto SE = PSE.getSE(); 6177 6178 // TODO: We need to estimate the cost of intrinsic calls. 6179 switch (I->getOpcode()) { 6180 case Instruction::GetElementPtr: 6181 // We mark this instruction as zero-cost because the cost of GEPs in 6182 // vectorized code depends on whether the corresponding memory instruction 6183 // is scalarized or not. Therefore, we handle GEPs with the memory 6184 // instruction cost. 6185 return 0; 6186 case Instruction::Br: { 6187 // In cases of scalarized and predicated instructions, there will be VF 6188 // predicated blocks in the vectorized loop. Each branch around these 6189 // blocks requires also an extract of its vector compare i1 element. 6190 bool ScalarPredicatedBB = false; 6191 BranchInst *BI = cast<BranchInst>(I); 6192 if (VF > 1 && BI->isConditional() && 6193 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) != 6194 PredicatedBBsAfterVectorization.end() || 6195 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) != 6196 PredicatedBBsAfterVectorization.end())) 6197 ScalarPredicatedBB = true; 6198 6199 if (ScalarPredicatedBB) { 6200 // Return cost for branches around scalarized and predicated blocks. 6201 Type *Vec_i1Ty = 6202 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); 6203 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) + 6204 (TTI.getCFInstrCost(Instruction::Br) * VF)); 6205 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1) 6206 // The back-edge branch will remain, as will all scalar branches. 6207 return TTI.getCFInstrCost(Instruction::Br); 6208 else 6209 // This branch will be eliminated by if-conversion. 6210 return 0; 6211 // Note: We currently assume zero cost for an unconditional branch inside 6212 // a predicated block since it will become a fall-through, although we 6213 // may decide in the future to call TTI for all branches. 6214 } 6215 case Instruction::PHI: { 6216 auto *Phi = cast<PHINode>(I); 6217 6218 // First-order recurrences are replaced by vector shuffles inside the loop. 6219 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 6220 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi)) 6221 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, 6222 VectorTy, VF - 1, VectorType::get(RetTy, 1)); 6223 6224 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6225 // converted into select instructions. We require N - 1 selects per phi 6226 // node, where N is the number of incoming values. 
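    // For example, a phi in an if-converted block with three incoming values
    // is costed below as two vector selects, each operating on a <VF x Ty>
    // value with a <VF x i1> condition.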
6227 if (VF > 1 && Phi->getParent() != TheLoop->getHeader()) 6228 return (Phi->getNumIncomingValues() - 1) * 6229 TTI.getCmpSelInstrCost( 6230 Instruction::Select, ToVectorTy(Phi->getType(), VF), 6231 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF)); 6232 6233 return TTI.getCFInstrCost(Instruction::PHI); 6234 } 6235 case Instruction::UDiv: 6236 case Instruction::SDiv: 6237 case Instruction::URem: 6238 case Instruction::SRem: 6239 // If we have a predicated instruction, it may not be executed for each 6240 // vector lane. Get the scalarization cost and scale this amount by the 6241 // probability of executing the predicated block. If the instruction is not 6242 // predicated, we fall through to the next case. 6243 if (VF > 1 && isScalarWithPredication(I)) { 6244 unsigned Cost = 0; 6245 6246 // These instructions have a non-void type, so account for the phi nodes 6247 // that we will create. This cost is likely to be zero. The phi node 6248 // cost, if any, should be scaled by the block probability because it 6249 // models a copy at the end of each predicated block. 6250 Cost += VF * TTI.getCFInstrCost(Instruction::PHI); 6251 6252 // The cost of the non-predicated instruction. 6253 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy); 6254 6255 // The cost of insertelement and extractelement instructions needed for 6256 // scalarization. 6257 Cost += getScalarizationOverhead(I, VF); 6258 6259 // Scale the cost by the probability of executing the predicated blocks. 6260 // This assumes the predicated block for each vector lane is equally 6261 // likely. 6262 return Cost / getReciprocalPredBlockProb(); 6263 } 6264 LLVM_FALLTHROUGH; 6265 case Instruction::Add: 6266 case Instruction::FAdd: 6267 case Instruction::Sub: 6268 case Instruction::FSub: 6269 case Instruction::Mul: 6270 case Instruction::FMul: 6271 case Instruction::FDiv: 6272 case Instruction::FRem: 6273 case Instruction::Shl: 6274 case Instruction::LShr: 6275 case Instruction::AShr: 6276 case Instruction::And: 6277 case Instruction::Or: 6278 case Instruction::Xor: { 6279 // Since we will replace the stride by 1 the multiplication should go away. 6280 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) 6281 return 0; 6282 // Certain instructions can be cheaper to vectorize if they have a constant 6283 // second vector operand. One example of this are shifts on x86. 6284 Value *Op2 = I->getOperand(1); 6285 TargetTransformInfo::OperandValueProperties Op2VP; 6286 TargetTransformInfo::OperandValueKind Op2VK = 6287 TTI.getOperandInfo(Op2, Op2VP); 6288 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2)) 6289 Op2VK = TargetTransformInfo::OK_UniformValue; 6290 6291 SmallVector<const Value *, 4> Operands(I->operand_values()); 6292 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; 6293 return N * TTI.getArithmeticInstrCost( 6294 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6295 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); 6296 } 6297 case Instruction::FNeg: { 6298 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6299 return N * TTI.getArithmeticInstrCost( 6300 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, 6301 TargetTransformInfo::OK_AnyValue, 6302 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, 6303 I->getOperand(0), I); 6304 } 6305 case Instruction::Select: { 6306 SelectInst *SI = cast<SelectInst>(I); 6307 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); 6308 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); 6309 Type *CondTy = SI->getCondition()->getType(); 6310 if (!ScalarCond) 6311 CondTy = VectorType::get(CondTy, VF); 6312 6313 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I); 6314 } 6315 case Instruction::ICmp: 6316 case Instruction::FCmp: { 6317 Type *ValTy = I->getOperand(0)->getType(); 6318 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0)); 6319 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) 6320 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); 6321 VectorTy = ToVectorTy(ValTy, VF); 6322 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I); 6323 } 6324 case Instruction::Store: 6325 case Instruction::Load: { 6326 unsigned Width = VF; 6327 if (Width > 1) { 6328 InstWidening Decision = getWideningDecision(I, Width); 6329 assert(Decision != CM_Unknown && 6330 "CM decision should be taken at this point"); 6331 if (Decision == CM_Scalarize) 6332 Width = 1; 6333 } 6334 VectorTy = ToVectorTy(getMemInstValueType(I), Width); 6335 return getMemoryInstructionCost(I, VF); 6336 } 6337 case Instruction::ZExt: 6338 case Instruction::SExt: 6339 case Instruction::FPToUI: 6340 case Instruction::FPToSI: 6341 case Instruction::FPExt: 6342 case Instruction::PtrToInt: 6343 case Instruction::IntToPtr: 6344 case Instruction::SIToFP: 6345 case Instruction::UIToFP: 6346 case Instruction::Trunc: 6347 case Instruction::FPTrunc: 6348 case Instruction::BitCast: { 6349 // We optimize the truncation of induction variables having constant 6350 // integer steps. The cost of these truncations is the same as the scalar 6351 // operation. 6352 if (isOptimizableIVTruncate(I, VF)) { 6353 auto *Trunc = cast<TruncInst>(I); 6354 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(), 6355 Trunc->getSrcTy(), Trunc); 6356 } 6357 6358 Type *SrcScalarTy = I->getOperand(0)->getType(); 6359 Type *SrcVecTy = 6360 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; 6361 if (canTruncateToMinimalBitwidth(I, VF)) { 6362 // This cast is going to be shrunk. This may remove the cast or it might 6363 // turn it into slightly different cast. For example, if MinBW == 16, 6364 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". 6365 // 6366 // Calculate the modified src and dest types. 6367 Type *MinVecTy = VectorTy; 6368 if (I->getOpcode() == Instruction::Trunc) { 6369 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); 6370 VectorTy = 6371 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6372 } else if (I->getOpcode() == Instruction::ZExt || 6373 I->getOpcode() == Instruction::SExt) { 6374 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); 6375 VectorTy = 6376 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy); 6377 } 6378 } 6379 6380 unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; 6381 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I); 6382 } 6383 case Instruction::Call: { 6384 bool NeedToScalarize; 6385 CallInst *CI = cast<CallInst>(I); 6386 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); 6387 if (getVectorIntrinsicIDForCall(CI, TLI)) 6388 return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); 6389 return CallCost; 6390 } 6391 default: 6392 // The cost of executing VF copies of the scalar instruction. This opcode 6393 // is unknown. Assume that it is the same as 'mul'. 6394 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + 6395 getScalarizationOverhead(I, VF); 6396 } // end of switch. 6397 } 6398 6399 char LoopVectorize::ID = 0; 6400 6401 static const char lv_name[] = "Loop Vectorization"; 6402 6403 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) 6404 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 6405 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) 6406 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 6407 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) 6408 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 6409 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) 6410 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 6411 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 6412 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 6413 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) 6414 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) 6415 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 6416 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 6417 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy) 6418 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) 6419 6420 namespace llvm { 6421 6422 Pass *createLoopVectorizePass() { return new LoopVectorize(); } 6423 6424 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced, 6425 bool VectorizeOnlyWhenForced) { 6426 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced); 6427 } 6428 6429 } // end namespace llvm 6430 6431 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { 6432 // Check if the pointer operand of a load or store instruction is 6433 // consecutive. 6434 if (auto *Ptr = getLoadStorePointerOperand(Inst)) 6435 return Legal->isConsecutivePtr(Ptr); 6436 return false; 6437 } 6438 6439 void LoopVectorizationCostModel::collectValuesToIgnore() { 6440 // Ignore ephemeral values. 6441 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); 6442 6443 // Ignore type-promoting instructions we identified during reduction 6444 // detection. 6445 for (auto &Reduction : Legal->getReductionVars()) { 6446 RecurrenceDescriptor &RedDes = Reduction.second; 6447 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); 6448 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6449 } 6450 // Ignore type-casting instructions we identified during induction 6451 // detection. 6452 for (auto &Induction : Legal->getInductionVars()) { 6453 InductionDescriptor &IndDes = Induction.second; 6454 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6455 VecValuesToIgnore.insert(Casts.begin(), Casts.end()); 6456 } 6457 } 6458 6459 // TODO: we could return a pair of values that specify the max VF and 6460 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 6461 // `buildVPlans(VF, VF)`. 
We cannot do it because VPLAN at the moment 6462 // doesn't have a cost model that can choose which plan to execute if 6463 // more than one is generated. 6464 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, 6465 LoopVectorizationCostModel &CM) { 6466 unsigned WidestType; 6467 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes(); 6468 return WidestVectorRegBits / WidestType; 6469 } 6470 6471 VectorizationFactor 6472 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { 6473 unsigned VF = UserVF; 6474 // Outer loop handling: They may require CFG and instruction level 6475 // transformations before even evaluating whether vectorization is profitable. 6476 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 6477 // the vectorization pipeline. 6478 if (!OrigLoop->empty()) { 6479 // If the user doesn't provide a vectorization factor, determine a 6480 // reasonable one. 6481 if (!UserVF) { 6482 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM); 6483 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); 6484 6485 // Make sure we have a VF > 1 for stress testing. 6486 if (VPlanBuildStressTest && VF < 2) { 6487 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " 6488 << "overriding computed VF.\n"); 6489 VF = 4; 6490 } 6491 } 6492 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 6493 assert(isPowerOf2_32(VF) && "VF needs to be a power of two"); 6494 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF 6495 << " to build VPlans.\n"); 6496 buildVPlans(VF, VF); 6497 6498 // For VPlan build stress testing, we bail out after VPlan construction. 6499 if (VPlanBuildStressTest) 6500 return VectorizationFactor::Disabled(); 6501 6502 return {VF, 0}; 6503 } 6504 6505 LLVM_DEBUG( 6506 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " 6507 "VPlan-native path.\n"); 6508 return VectorizationFactor::Disabled(); 6509 } 6510 6511 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { 6512 assert(OrigLoop->empty() && "Inner loop expected."); 6513 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); 6514 if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 6515 return None; 6516 6517 // Invalidate interleave groups if all blocks of loop will be predicated. 6518 if (CM.blockNeedsPredication(OrigLoop->getHeader()) && 6519 !useMaskedInterleavedAccesses(*TTI)) { 6520 LLVM_DEBUG( 6521 dbgs() 6522 << "LV: Invalidate all interleaved groups due to fold-tail by masking " 6523 "which requires masked-interleaved support.\n"); 6524 CM.InterleaveInfo.reset(); 6525 } 6526 6527 if (UserVF) { 6528 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); 6529 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); 6530 // Collect the instructions (and their associated costs) that will be more 6531 // profitable to scalarize. 6532 CM.selectUserVectorizationFactor(UserVF); 6533 buildVPlansWithVPRecipes(UserVF, UserVF); 6534 LLVM_DEBUG(printPlans(dbgs())); 6535 return {{UserVF, 0}}; 6536 } 6537 6538 unsigned MaxVF = MaybeMaxVF.getValue(); 6539 assert(MaxVF != 0 && "MaxVF is zero."); 6540 6541 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { 6542 // Collect Uniform and Scalar instructions after vectorization with VF. 6543 CM.collectUniformsAndScalars(VF); 6544 6545 // Collect the instructions (and their associated costs) that will be more 6546 // profitable to scalarize. 
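    // (With MaxVF = 8, for instance, this loop visits VF = 1, 2, 4 and 8;
    // scalarization candidates are only collected for the vector factors,
    // hence the VF > 1 check below.)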
6547 if (VF > 1) 6548 CM.collectInstsToScalarize(VF); 6549 } 6550 6551 buildVPlansWithVPRecipes(1, MaxVF); 6552 LLVM_DEBUG(printPlans(dbgs())); 6553 if (MaxVF == 1) 6554 return VectorizationFactor::Disabled(); 6555 6556 // Select the optimal vectorization factor. 6557 return CM.selectVectorizationFactor(MaxVF); 6558 } 6559 6560 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { 6561 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF 6562 << '\n'); 6563 BestVF = VF; 6564 BestUF = UF; 6565 6566 erase_if(VPlans, [VF](const VPlanPtr &Plan) { 6567 return !Plan->hasVF(VF); 6568 }); 6569 assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); 6570 } 6571 6572 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, 6573 DominatorTree *DT) { 6574 // Perform the actual loop transformation. 6575 6576 // 1. Create a new empty loop. Unlink the old loop and connect the new one. 6577 VPCallbackILV CallbackILV(ILV); 6578 6579 VPTransformState State{BestVF, BestUF, LI, 6580 DT, ILV.Builder, ILV.VectorLoopValueMap, 6581 &ILV, CallbackILV}; 6582 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); 6583 State.TripCount = ILV.getOrCreateTripCount(nullptr); 6584 6585 //===------------------------------------------------===// 6586 // 6587 // Notice: any optimization or new instruction that go 6588 // into the code below should also be implemented in 6589 // the cost-model. 6590 // 6591 //===------------------------------------------------===// 6592 6593 // 2. Copy and widen instructions from the old loop into the new loop. 6594 assert(VPlans.size() == 1 && "Not a single VPlan to execute."); 6595 VPlans.front()->execute(&State); 6596 6597 // 3. Fix the vectorized code: take care of header phi's, live-outs, 6598 // predication, updating analyses. 6599 ILV.fixVectorizedLoop(); 6600 } 6601 6602 void LoopVectorizationPlanner::collectTriviallyDeadInstructions( 6603 SmallPtrSetImpl<Instruction *> &DeadInstructions) { 6604 BasicBlock *Latch = OrigLoop->getLoopLatch(); 6605 6606 // We create new control-flow for the vectorized loop, so the original 6607 // condition will be dead after vectorization if it's only used by the 6608 // branch. 6609 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); 6610 if (Cmp && Cmp->hasOneUse()) 6611 DeadInstructions.insert(Cmp); 6612 6613 // We create new "steps" for induction variable updates to which the original 6614 // induction variables map. An original update instruction will be dead if 6615 // all its users except the induction variable are dead. 6616 for (auto &Induction : Legal->getInductionVars()) { 6617 PHINode *Ind = Induction.first; 6618 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch)); 6619 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { 6620 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) != 6621 DeadInstructions.end(); 6622 })) 6623 DeadInstructions.insert(IndUpdate); 6624 6625 // We record as "Dead" also the type-casting instructions we had identified 6626 // during induction analysis. We don't need any handling for them in the 6627 // vectorized loop because we have proven that, under a proper runtime 6628 // test guarding the vectorized loop, the value of the phi, and the casted 6629 // value of the phi, are the same. The last instruction in this casting chain 6630 // will get its scalar/vector/widened def from the scalar/vector/widened def 6631 // of the respective phi node. 
Any other casts in the induction def-use chain 6632 // have no other uses outside the phi update chain, and will be ignored. 6633 InductionDescriptor &IndDes = Induction.second; 6634 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); 6635 DeadInstructions.insert(Casts.begin(), Casts.end()); 6636 } 6637 } 6638 6639 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } 6640 6641 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } 6642 6643 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step, 6644 Instruction::BinaryOps BinOp) { 6645 // When unrolling and the VF is 1, we only need to add a simple scalar. 6646 Type *Ty = Val->getType(); 6647 assert(!Ty->isVectorTy() && "Val must be a scalar"); 6648 6649 if (Ty->isFloatingPointTy()) { 6650 Constant *C = ConstantFP::get(Ty, (double)StartIdx); 6651 6652 // Floating point operations had to be 'fast' to enable the unrolling. 6653 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step)); 6654 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp)); 6655 } 6656 Constant *C = ConstantInt::get(Ty, StartIdx); 6657 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); 6658 } 6659 6660 static void AddRuntimeUnrollDisableMetaData(Loop *L) { 6661 SmallVector<Metadata *, 4> MDs; 6662 // Reserve first location for self reference to the LoopID metadata node. 6663 MDs.push_back(nullptr); 6664 bool IsUnrollMetadata = false; 6665 MDNode *LoopID = L->getLoopID(); 6666 if (LoopID) { 6667 // First find existing loop unrolling disable metadata. 6668 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { 6669 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); 6670 if (MD) { 6671 const auto *S = dyn_cast<MDString>(MD->getOperand(0)); 6672 IsUnrollMetadata = 6673 S && S->getString().startswith("llvm.loop.unroll.disable"); 6674 } 6675 MDs.push_back(LoopID->getOperand(i)); 6676 } 6677 } 6678 6679 if (!IsUnrollMetadata) { 6680 // Add runtime unroll disable metadata. 6681 LLVMContext &Context = L->getHeader()->getContext(); 6682 SmallVector<Metadata *, 1> DisableOperands; 6683 DisableOperands.push_back( 6684 MDString::get(Context, "llvm.loop.unroll.runtime.disable")); 6685 MDNode *DisableNode = MDNode::get(Context, DisableOperands); 6686 MDs.push_back(DisableNode); 6687 MDNode *NewLoopID = MDNode::get(Context, MDs); 6688 // Set operand 0 to refer to the loop id itself. 6689 NewLoopID->replaceOperandWith(0, NewLoopID); 6690 L->setLoopID(NewLoopID); 6691 } 6692 } 6693 6694 bool LoopVectorizationPlanner::getDecisionAndClampRange( 6695 const std::function<bool(unsigned)> &Predicate, VFRange &Range) { 6696 assert(Range.End > Range.Start && "Trying to test an empty VF range."); 6697 bool PredicateAtRangeStart = Predicate(Range.Start); 6698 6699 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) 6700 if (Predicate(TmpVF) != PredicateAtRangeStart) { 6701 Range.End = TmpVF; 6702 break; 6703 } 6704 6705 return PredicateAtRangeStart; 6706 } 6707 6708 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, 6709 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range 6710 /// of VF's starting at a given VF and extending it as much as possible. Each 6711 /// vectorization decision can potentially shorten this sub-range during 6712 /// buildVPlan(). 
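///
/// For example (illustrative numbers): with MinVF = 1 and MaxVF = 8, the first
/// iteration starts from the range [1, 9). If some widening decision first
/// changes at VF = 4, getDecisionAndClampRange() clamps the range to [1, 4),
/// one VPlan is built covering VFs {1, 2}, and the next iteration builds a new
/// VPlan starting at VF = 4.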
6713 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { 6714 for (unsigned VF = MinVF; VF < MaxVF + 1;) { 6715 VFRange SubRange = {VF, MaxVF + 1}; 6716 VPlans.push_back(buildVPlan(SubRange)); 6717 VF = SubRange.End; 6718 } 6719 } 6720 6721 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, 6722 VPlanPtr &Plan) { 6723 assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); 6724 6725 // Look for cached value. 6726 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); 6727 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge); 6728 if (ECEntryIt != EdgeMaskCache.end()) 6729 return ECEntryIt->second; 6730 6731 VPValue *SrcMask = createBlockInMask(Src, Plan); 6732 6733 // The terminator has to be a branch inst! 6734 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); 6735 assert(BI && "Unexpected terminator found"); 6736 6737 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) 6738 return EdgeMaskCache[Edge] = SrcMask; 6739 6740 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); 6741 assert(EdgeMask && "No Edge Mask found for condition"); 6742 6743 if (BI->getSuccessor(0) != Dst) 6744 EdgeMask = Builder.createNot(EdgeMask); 6745 6746 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND. 6747 EdgeMask = Builder.createAnd(EdgeMask, SrcMask); 6748 6749 return EdgeMaskCache[Edge] = EdgeMask; 6750 } 6751 6752 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { 6753 assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); 6754 6755 // Look for cached value. 6756 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB); 6757 if (BCEntryIt != BlockMaskCache.end()) 6758 return BCEntryIt->second; 6759 6760 // All-one mask is modelled as no-mask following the convention for masked 6761 // load/store/gather/scatter. Initialize BlockMask to no-mask. 6762 VPValue *BlockMask = nullptr; 6763 6764 if (OrigLoop->getHeader() == BB) { 6765 if (!CM.blockNeedsPredication(BB)) 6766 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. 6767 6768 // Introduce the early-exit compare IV <= BTC to form header block mask. 6769 // This is used instead of IV < TC because TC may wrap, unlike BTC. 6770 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); 6771 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); 6772 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); 6773 return BlockMaskCache[BB] = BlockMask; 6774 } 6775 6776 // This is the block mask. We OR all incoming edges. 6777 for (auto *Predecessor : predecessors(BB)) { 6778 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan); 6779 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too. 6780 return BlockMaskCache[BB] = EdgeMask; 6781 6782 if (!BlockMask) { // BlockMask has its initialized nullptr value. 
6783 BlockMask = EdgeMask; 6784 continue; 6785 } 6786 6787 BlockMask = Builder.createOr(BlockMask, EdgeMask); 6788 } 6789 6790 return BlockMaskCache[BB] = BlockMask; 6791 } 6792 6793 VPWidenMemoryInstructionRecipe * 6794 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, 6795 VPlanPtr &Plan) { 6796 if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) 6797 return nullptr; 6798 6799 auto willWiden = [&](unsigned VF) -> bool { 6800 if (VF == 1) 6801 return false; 6802 LoopVectorizationCostModel::InstWidening Decision = 6803 CM.getWideningDecision(I, VF); 6804 assert(Decision != LoopVectorizationCostModel::CM_Unknown && 6805 "CM decision should be taken at this point."); 6806 if (Decision == LoopVectorizationCostModel::CM_Interleave) 6807 return true; 6808 if (CM.isScalarAfterVectorization(I, VF) || 6809 CM.isProfitableToScalarize(I, VF)) 6810 return false; 6811 return Decision != LoopVectorizationCostModel::CM_Scalarize; 6812 }; 6813 6814 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6815 return nullptr; 6816 6817 VPValue *Mask = nullptr; 6818 if (Legal->isMaskRequired(I)) 6819 Mask = createBlockInMask(I->getParent(), Plan); 6820 6821 VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); 6822 if (LoadInst *Load = dyn_cast<LoadInst>(I)) 6823 return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask); 6824 6825 StoreInst *Store = cast<StoreInst>(I); 6826 VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); 6827 return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); 6828 } 6829 6830 VPWidenIntOrFpInductionRecipe * 6831 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) { 6832 if (PHINode *Phi = dyn_cast<PHINode>(I)) { 6833 // Check if this is an integer or fp induction. If so, build the recipe that 6834 // produces its scalar and vector values. 6835 InductionDescriptor II = Legal->getInductionVars().lookup(Phi); 6836 if (II.getKind() == InductionDescriptor::IK_IntInduction || 6837 II.getKind() == InductionDescriptor::IK_FpInduction) 6838 return new VPWidenIntOrFpInductionRecipe(Phi); 6839 6840 return nullptr; 6841 } 6842 6843 // Optimize the special case where the source is a constant integer 6844 // induction variable. Notice that we can only optimize the 'trunc' case 6845 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and 6846 // (c) other casts depend on pointer size. 6847 6848 // Determine whether \p K is a truncation based on an induction variable that 6849 // can be optimized. 6850 auto isOptimizableIVTruncate = 6851 [&](Instruction *K) -> std::function<bool(unsigned)> { 6852 return 6853 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; 6854 }; 6855 6856 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange( 6857 isOptimizableIVTruncate(I), Range)) 6858 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), 6859 cast<TruncInst>(I)); 6860 return nullptr; 6861 } 6862 6863 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { 6864 PHINode *Phi = dyn_cast<PHINode>(I); 6865 if (!Phi || Phi->getParent() == OrigLoop->getHeader()) 6866 return nullptr; 6867 6868 // We know that all PHIs in non-header blocks are converted into selects, so 6869 // we don't have to worry about the insertion order and we can just use the 6870 // builder. At this point we generate the predication tree. 
There may be 6871 // duplications since this is a simple recursive scan, but future 6872 // optimizations will clean it up. 6873 6874 SmallVector<VPValue *, 2> Masks; 6875 unsigned NumIncoming = Phi->getNumIncomingValues(); 6876 for (unsigned In = 0; In < NumIncoming; In++) { 6877 VPValue *EdgeMask = 6878 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan); 6879 assert((EdgeMask || NumIncoming == 1) && 6880 "Multiple predecessors with one having a full mask"); 6881 if (EdgeMask) 6882 Masks.push_back(EdgeMask); 6883 } 6884 return new VPBlendRecipe(Phi, Masks); 6885 } 6886 6887 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, 6888 VFRange &Range) { 6889 6890 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 6891 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 6892 6893 if (IsPredicated) 6894 return false; 6895 6896 auto IsVectorizableOpcode = [](unsigned Opcode) { 6897 switch (Opcode) { 6898 case Instruction::Add: 6899 case Instruction::And: 6900 case Instruction::AShr: 6901 case Instruction::BitCast: 6902 case Instruction::Br: 6903 case Instruction::Call: 6904 case Instruction::FAdd: 6905 case Instruction::FCmp: 6906 case Instruction::FDiv: 6907 case Instruction::FMul: 6908 case Instruction::FNeg: 6909 case Instruction::FPExt: 6910 case Instruction::FPToSI: 6911 case Instruction::FPToUI: 6912 case Instruction::FPTrunc: 6913 case Instruction::FRem: 6914 case Instruction::FSub: 6915 case Instruction::ICmp: 6916 case Instruction::IntToPtr: 6917 case Instruction::Load: 6918 case Instruction::LShr: 6919 case Instruction::Mul: 6920 case Instruction::Or: 6921 case Instruction::PHI: 6922 case Instruction::PtrToInt: 6923 case Instruction::SDiv: 6924 case Instruction::Select: 6925 case Instruction::SExt: 6926 case Instruction::Shl: 6927 case Instruction::SIToFP: 6928 case Instruction::SRem: 6929 case Instruction::Store: 6930 case Instruction::Sub: 6931 case Instruction::Trunc: 6932 case Instruction::UDiv: 6933 case Instruction::UIToFP: 6934 case Instruction::URem: 6935 case Instruction::Xor: 6936 case Instruction::ZExt: 6937 return true; 6938 } 6939 return false; 6940 }; 6941 6942 if (!IsVectorizableOpcode(I->getOpcode())) 6943 return false; 6944 6945 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6946 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6947 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || 6948 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) 6949 return false; 6950 } 6951 6952 auto willWiden = [&](unsigned VF) -> bool { 6953 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || 6954 CM.isProfitableToScalarize(I, VF))) 6955 return false; 6956 if (CallInst *CI = dyn_cast<CallInst>(I)) { 6957 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); 6958 // The following case may be scalarized depending on the VF. 6959 // The flag shows whether we use Intrinsic or a usual Call for vectorized 6960 // version of the instruction. 6961 // Is it beneficial to perform intrinsic call compared to lib call? 
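      // Sketch of the decision below (no real costs implied): if the call maps
      // to a vector intrinsic whose cost is no higher than the cost returned
      // by getVectorCallCost(), the intrinsic form is used and the call is
      // widened; otherwise it is widened only when a vector library function
      // lets us avoid scalarizing the call (NeedToScalarize == false).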
6962 bool NeedToScalarize; 6963 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); 6964 bool UseVectorIntrinsic = 6965 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; 6966 return UseVectorIntrinsic || !NeedToScalarize; 6967 } 6968 if (isa<LoadInst>(I) || isa<StoreInst>(I)) { 6969 assert(CM.getWideningDecision(I, VF) == 6970 LoopVectorizationCostModel::CM_Scalarize && 6971 "Memory widening decisions should have been taken care by now"); 6972 return false; 6973 } 6974 return true; 6975 }; 6976 6977 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) 6978 return false; 6979 // If this ingredient's recipe is to be recorded, keep its recipe a singleton 6980 // to avoid having to split recipes later. 6981 bool IsSingleton = Ingredient2Recipe.count(I); 6982 6983 // Success: widen this instruction. 6984 6985 // Use the default widening recipe. We optimize the common case where 6986 // consecutive instructions can be represented by a single recipe. 6987 if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && 6988 LastExtensibleRecipe->appendInstruction(I)) 6989 return true; 6990 6991 VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); 6992 if (!IsSingleton) 6993 LastExtensibleRecipe = WidenRecipe; 6994 setRecipe(I, WidenRecipe); 6995 VPBB->appendRecipe(WidenRecipe); 6996 return true; 6997 } 6998 6999 VPBasicBlock *VPRecipeBuilder::handleReplication( 7000 Instruction *I, VFRange &Range, VPBasicBlock *VPBB, 7001 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe, 7002 VPlanPtr &Plan) { 7003 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( 7004 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, 7005 Range); 7006 7007 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( 7008 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); 7009 7010 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); 7011 setRecipe(I, Recipe); 7012 7013 // Find if I uses a predicated instruction. If so, it will use its scalar 7014 // value. Avoid hoisting the insert-element which packs the scalar value into 7015 // a vector value, as that happens iff all users use the vector value. 7016 for (auto &Op : I->operands()) 7017 if (auto *PredInst = dyn_cast<Instruction>(Op)) 7018 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) 7019 PredInst2Recipe[PredInst]->setAlsoPack(false); 7020 7021 // Finalize the recipe for Instr, first if it is not predicated. 7022 if (!IsPredicated) { 7023 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); 7024 VPBB->appendRecipe(Recipe); 7025 return VPBB; 7026 } 7027 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); 7028 assert(VPBB->getSuccessors().empty() && 7029 "VPBB has successors when handling predicated replication."); 7030 // Record predicated instructions for above packing optimizations. 7031 PredInst2Recipe[I] = Recipe; 7032 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); 7033 VPBlockUtils::insertBlockAfter(Region, VPBB); 7034 auto *RegSucc = new VPBasicBlock(); 7035 VPBlockUtils::insertBlockAfter(RegSucc, Region); 7036 return RegSucc; 7037 } 7038 7039 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, 7040 VPRecipeBase *PredRecipe, 7041 VPlanPtr &Plan) { 7042 // Instructions marked for predication are replicated and placed under an 7043 // if-then construct to prevent side-effects. 7044 7045 // Generate recipes to compute the block mask for this region. 
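  // The mask guards the replicated instruction: the region built below
  // branches on it from pred.<opcode>.entry into pred.<opcode>.if (which
  // holds the replicated recipe) and falls through directly to
  // pred.<opcode>.continue for masked-off lanes.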
7046 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan); 7047 7048 // Build the triangular if-then region. 7049 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); 7050 assert(Instr->getParent() && "Predicated instruction not in any basic block"); 7051 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); 7052 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); 7053 auto *PHIRecipe = 7054 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); 7055 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); 7056 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); 7057 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); 7058 7059 // Note: first set Entry as region entry and then connect successors starting 7060 // from it in order, to propagate the "parent" of each VPBasicBlock. 7061 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry); 7062 VPBlockUtils::connectBlocks(Pred, Exit); 7063 7064 return Region; 7065 } 7066 7067 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, 7068 VPlanPtr &Plan, VPBasicBlock *VPBB) { 7069 VPRecipeBase *Recipe = nullptr; 7070 7071 // First, check for specific widening recipes that deal with memory 7072 // operations, inductions and Phi nodes. 7073 if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || 7074 (Recipe = tryToOptimizeInduction(Instr, Range)) || 7075 (Recipe = tryToBlend(Instr, Plan)) || 7076 (isa<PHINode>(Instr) && 7077 (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { 7078 setRecipe(Instr, Recipe); 7079 VPBB->appendRecipe(Recipe); 7080 return true; 7081 } 7082 7083 // Handle GEP widening. 7084 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { 7085 auto Scalarize = [&](unsigned VF) { 7086 return CM.isScalarWithPredication(Instr, VF) || 7087 CM.isScalarAfterVectorization(Instr, VF) || 7088 CM.isProfitableToScalarize(Instr, VF); 7089 }; 7090 if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) 7091 return false; 7092 VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); 7093 setRecipe(Instr, Recipe); 7094 VPBB->appendRecipe(Recipe); 7095 return true; 7096 } 7097 7098 // Check if Instr is to be widened by a general VPWidenRecipe, after 7099 // having first checked for specific widening recipes. 7100 if (tryToWiden(Instr, VPBB, Range)) 7101 return true; 7102 7103 return false; 7104 } 7105 7106 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, 7107 unsigned MaxVF) { 7108 assert(OrigLoop->empty() && "Inner loop expected."); 7109 7110 // Collect conditions feeding internal conditional branches; they need to be 7111 // represented in VPlan for it to model masking. 7112 SmallPtrSet<Value *, 1> NeedDef; 7113 7114 auto *Latch = OrigLoop->getLoopLatch(); 7115 for (BasicBlock *BB : OrigLoop->blocks()) { 7116 if (BB == Latch) 7117 continue; 7118 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator()); 7119 if (Branch && Branch->isConditional()) 7120 NeedDef.insert(Branch->getCondition()); 7121 } 7122 7123 // If the tail is to be folded by masking, the primary induction variable 7124 // needs to be represented in VPlan for it to model early-exit masking. 7125 // Also, both the Phi and the live-out instruction of each reduction are 7126 // required in order to introduce a select between them in VPlan. 
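  // For example (illustrative), for a reduction of the form
  //   %red.next = add i32 %red.phi, %val
  // both %red.phi and %red.next must have VPValues so that a
  //   select(<header mask>, %red.next, %red.phi)
  // can be generated at the end of the latch when the tail is folded.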
  if (CM.foldTailByMasking()) {
    NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
                                             DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
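  // For example (illustrative), the accesses a[2*i] and a[2*i+1] of a factor-2
  // interleave group are first given individual memory recipes; those
  // placeholders are replaced by one VPInterleaveRecipe once the plan has been
  // built.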
7194 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { 7195 auto applyIG = [IG, this](unsigned VF) -> bool { 7196 return (VF >= 2 && // Query is illegal for VF == 1 7197 CM.getWideningDecision(IG->getInsertPos(), VF) == 7198 LoopVectorizationCostModel::CM_Interleave); 7199 }; 7200 if (!getDecisionAndClampRange(applyIG, Range)) 7201 continue; 7202 InterleaveGroups.insert(IG); 7203 for (unsigned i = 0; i < IG->getFactor(); i++) 7204 if (Instruction *Member = IG->getMember(i)) 7205 RecipeBuilder.recordRecipeOf(Member); 7206 }; 7207 7208 // --------------------------------------------------------------------------- 7209 // Build initial VPlan: Scan the body of the loop in a topological order to 7210 // visit each basic block after having visited its predecessor basic blocks. 7211 // --------------------------------------------------------------------------- 7212 7213 // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 7214 auto Plan = std::make_unique<VPlan>(); 7215 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); 7216 Plan->setEntry(VPBB); 7217 7218 // Represent values that will have defs inside VPlan. 7219 for (Value *V : NeedDef) 7220 Plan->addVPValue(V); 7221 7222 // Scan the body of the loop in a topological order to visit each basic block 7223 // after having visited its predecessor basic blocks. 7224 LoopBlocksDFS DFS(OrigLoop); 7225 DFS.perform(LI); 7226 7227 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { 7228 // Relevant instructions from basic block BB will be grouped into VPRecipe 7229 // ingredients and fill a new VPBasicBlock. 7230 unsigned VPBBsForBB = 0; 7231 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); 7232 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); 7233 VPBB = FirstVPBBForBB; 7234 Builder.setInsertPoint(VPBB); 7235 7236 // Introduce each ingredient into VPlan. 7237 for (Instruction &I : BB->instructionsWithoutDebug()) { 7238 Instruction *Instr = &I; 7239 7240 // First filter out irrelevant instructions, to ensure no recipes are 7241 // built for them. 7242 if (isa<BranchInst>(Instr) || 7243 DeadInstructions.find(Instr) != DeadInstructions.end()) 7244 continue; 7245 7246 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) 7247 continue; 7248 7249 // Otherwise, if all widening options failed, Instruction is to be 7250 // replicated. This may create a successor for VPBB. 7251 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication( 7252 Instr, Range, VPBB, PredInst2Recipe, Plan); 7253 if (NextVPBB != VPBB) { 7254 VPBB = NextVPBB; 7255 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) 7256 : ""); 7257 } 7258 } 7259 } 7260 7261 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks 7262 // may also be empty, such as the last one VPBB, reflecting original 7263 // basic-blocks with no recipes. 7264 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); 7265 assert(PreEntry->empty() && "Expecting empty pre-entry block."); 7266 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); 7267 VPBlockUtils::disconnectBlocks(PreEntry, Entry); 7268 delete PreEntry; 7269 7270 // --------------------------------------------------------------------------- 7271 // Transform initial VPlan: Apply previously taken decisions, in order, to 7272 // bring the VPlan to its final state. 7273 // --------------------------------------------------------------------------- 7274 7275 // Apply Sink-After legal constraints. 
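  // Each (Sink, Target) pair recorded by Legality moves the recipe built for
  // Sink to just after the recipe built for Target; this is how, e.g., a use
  // of a first-order recurrence phi is sunk past the definition of the
  // recurring value it depends on.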
7276 for (auto &Entry : SinkAfter) { 7277 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); 7278 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); 7279 Sink->moveAfter(Target); 7280 } 7281 7282 // Interleave memory: for each Interleave Group we marked earlier as relevant 7283 // for this VPlan, replace the Recipes widening its memory instructions with a 7284 // single VPInterleaveRecipe at its insertion point. 7285 for (auto IG : InterleaveGroups) { 7286 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( 7287 RecipeBuilder.getRecipe(IG->getInsertPos())); 7288 (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) 7289 ->insertBefore(Recipe); 7290 7291 for (unsigned i = 0; i < IG->getFactor(); ++i) 7292 if (Instruction *Member = IG->getMember(i)) { 7293 RecipeBuilder.getRecipe(Member)->eraseFromParent(); 7294 } 7295 } 7296 7297 // Finally, if tail is folded by masking, introduce selects between the phi 7298 // and the live-out instruction of each reduction, at the end of the latch. 7299 if (CM.foldTailByMasking()) { 7300 Builder.setInsertPoint(VPBB); 7301 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); 7302 for (auto &Reduction : Legal->getReductionVars()) { 7303 VPValue *Phi = Plan->getVPValue(Reduction.first); 7304 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); 7305 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); 7306 } 7307 } 7308 7309 std::string PlanName; 7310 raw_string_ostream RSO(PlanName); 7311 unsigned VF = Range.Start; 7312 Plan->addVF(VF); 7313 RSO << "Initial VPlan for VF={" << VF; 7314 for (VF *= 2; VF < Range.End; VF *= 2) { 7315 Plan->addVF(VF); 7316 RSO << "," << VF; 7317 } 7318 RSO << "},UF>=1"; 7319 RSO.flush(); 7320 Plan->setName(PlanName); 7321 7322 return Plan; 7323 } 7324 7325 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { 7326 // Outer loop handling: They may require CFG and instruction level 7327 // transformations before even evaluating whether vectorization is profitable. 7328 // Since we cannot modify the incoming IR, we need to build VPlan upfront in 7329 // the vectorization pipeline. 7330 assert(!OrigLoop->empty()); 7331 assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); 7332 7333 // Create new empty VPlan 7334 auto Plan = std::make_unique<VPlan>(); 7335 7336 // Build hierarchical CFG 7337 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); 7338 HCFGBuilder.buildHierarchicalCFG(); 7339 7340 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2) 7341 Plan->addVF(VF); 7342 7343 if (EnableVPlanPredication) { 7344 VPlanPredicator VPP(*Plan); 7345 VPP.predicate(); 7346 7347 // Avoid running transformation to recipes until masked code generation in 7348 // VPlan-native path is in place. 
7349 return Plan; 7350 } 7351 7352 SmallPtrSet<Instruction *, 1> DeadInstructions; 7353 VPlanTransforms::VPInstructionsToVPRecipes( 7354 OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); 7355 return Plan; 7356 } 7357 7358 Value* LoopVectorizationPlanner::VPCallbackILV:: 7359 getOrCreateVectorValues(Value *V, unsigned Part) { 7360 return ILV.getOrCreateVectorValue(V, Part); 7361 } 7362 7363 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( 7364 Value *V, const VPIteration &Instance) { 7365 return ILV.getOrCreateScalarValue(V, Instance); 7366 } 7367 7368 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, 7369 VPSlotTracker &SlotTracker) const { 7370 O << " +\n" 7371 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; 7372 IG->getInsertPos()->printAsOperand(O, false); 7373 O << ", "; 7374 getAddr()->printAsOperand(O, SlotTracker); 7375 VPValue *Mask = getMask(); 7376 if (Mask) { 7377 O << ", "; 7378 Mask->printAsOperand(O, SlotTracker); 7379 } 7380 O << "\\l\""; 7381 for (unsigned i = 0; i < IG->getFactor(); ++i) 7382 if (Instruction *I = IG->getMember(i)) 7383 O << " +\n" 7384 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; 7385 } 7386 7387 void VPWidenRecipe::execute(VPTransformState &State) { 7388 for (auto &Instr : make_range(Begin, End)) 7389 State.ILV->widenInstruction(Instr); 7390 } 7391 7392 void VPWidenGEPRecipe::execute(VPTransformState &State) { 7393 State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, 7394 IsIndexLoopInvariant); 7395 } 7396 7397 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { 7398 assert(!State.Instance && "Int or FP induction being replicated."); 7399 State.ILV->widenIntOrFpInduction(IV, Trunc); 7400 } 7401 7402 void VPWidenPHIRecipe::execute(VPTransformState &State) { 7403 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); 7404 } 7405 7406 void VPBlendRecipe::execute(VPTransformState &State) { 7407 State.ILV->setDebugLocFromInst(State.Builder, Phi); 7408 // We know that all PHIs in non-header blocks are converted into 7409 // selects, so we don't have to worry about the insertion order and we 7410 // can just use the builder. 7411 // At this point we generate the predication tree. There may be 7412 // duplications since this is a simple recursive scan, but future 7413 // optimizations will clean it up. 7414 7415 unsigned NumIncoming = Phi->getNumIncomingValues(); 7416 7417 assert((User || NumIncoming == 1) && 7418 "Multiple predecessors with predecessors having a full mask"); 7419 // Generate a sequence of selects of the form: 7420 // SELECT(Mask3, In3, 7421 // SELECT(Mask2, In2, 7422 // ( ...))) 7423 InnerLoopVectorizer::VectorParts Entry(State.UF); 7424 for (unsigned In = 0; In < NumIncoming; ++In) { 7425 for (unsigned Part = 0; Part < State.UF; ++Part) { 7426 // We might have single edge PHIs (blocks) - use an identity 7427 // 'select' for the first PHI operand. 7428 Value *In0 = 7429 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part); 7430 if (In == 0) 7431 Entry[Part] = In0; // Initialize with the first incoming value. 7432 else { 7433 // Select between the current value and the previous incoming edge 7434 // based on the incoming mask. 
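        // For a phi with three incoming values this yields, per part,
        //   %predphi  = select %mask1, %in1, %in0
        //   %predphi1 = select %mask2, %in2, %predphi
        // (illustrative value names).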
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
                                      getMask());
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
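  // The successors become the pred.*.if and pred.*.continue blocks of the
  // enclosing replicate region once those basic blocks are emitted.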
7495 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); 7496 assert(isa<UnreachableInst>(CurrentTerminator) && 7497 "Expected to replace unreachable terminator with conditional branch."); 7498 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); 7499 CondBr->setSuccessor(0, nullptr); 7500 ReplaceInstWithInst(CurrentTerminator, CondBr); 7501 } 7502 7503 void VPPredInstPHIRecipe::execute(VPTransformState &State) { 7504 assert(State.Instance && "Predicated instruction PHI works per instance."); 7505 Instruction *ScalarPredInst = cast<Instruction>( 7506 State.ValueMap.getScalarValue(PredInst, *State.Instance)); 7507 BasicBlock *PredicatedBB = ScalarPredInst->getParent(); 7508 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); 7509 assert(PredicatingBB && "Predicated block has no single predecessor."); 7510 7511 // By current pack/unpack logic we need to generate only a single phi node: if 7512 // a vector value for the predicated instruction exists at this point it means 7513 // the instruction has vector users only, and a phi for the vector value is 7514 // needed. In this case the recipe of the predicated instruction is marked to 7515 // also do that packing, thereby "hoisting" the insert-element sequence. 7516 // Otherwise, a phi node for the scalar value is needed. 7517 unsigned Part = State.Instance->Part; 7518 if (State.ValueMap.hasVectorValue(PredInst, Part)) { 7519 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); 7520 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); 7521 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); 7522 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. 7523 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. 7524 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. 7525 } else { 7526 Type *PredInstType = PredInst->getType(); 7527 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); 7528 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); 7529 Phi->addIncoming(ScalarPredInst, PredicatedBB); 7530 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); 7531 } 7532 } 7533 7534 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { 7535 VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr; 7536 State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue, 7537 getMask()); 7538 } 7539 7540 // Determine how to lower the scalar epilogue, which depends on 1) optimising 7541 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing 7542 // predication, and 4) a TTI hook that analyses whether the loop is suitable 7543 // for predication. 7544 static ScalarEpilogueLowering getScalarEpilogueLowering( 7545 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, 7546 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, 7547 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, 7548 LoopVectorizationLegality &LVL) { 7549 bool OptSize = 7550 F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, 7551 PGSOQueryType::IRPass); 7552 // 1) OptSize takes precedence over all other options, i.e. if this is set, 7553 // don't look at hints or options, and don't request a scalar epilogue. 
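  // (A forced vectorization hint, e.g. #pragma clang loop vectorize(enable),
  //  still overrides the size heuristic below.)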
  if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
    return CM_ScalarEpilogueNotAllowedOptSize;

  bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
                              !PreferPredicateOverEpilog;

  // 2) Next, if disabling predication is requested on the command line, honour
  // this and request a scalar epilogue. Also do this if we don't have a
  // primary induction variable, which is required for predication.
  if (PredicateOptDisabled || !LVL.getPrimaryInduction())
    return CM_ScalarEpilogueAllowed;

  // 3) and 4) look if enabling predication is requested on the command line,
  // with a loop hint, or if the TTI hook indicates this is profitable; if so,
  // request predication.
  if (PreferPredicateOverEpilog ||
      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                        LVL.getLAI()) &&
       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
7626 Hints.setAlreadyVectorized(); 7627 7628 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7629 return true; 7630 } 7631 7632 bool LoopVectorizePass::processLoop(Loop *L) { 7633 assert((EnableVPlanNativePath || L->empty()) && 7634 "VPlan-native path is not enabled. Only process inner loops."); 7635 7636 #ifndef NDEBUG 7637 const std::string DebugLocStr = getDebugLocString(L); 7638 #endif /* NDEBUG */ 7639 7640 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \"" 7641 << L->getHeader()->getParent()->getName() << "\" from " 7642 << DebugLocStr << "\n"); 7643 7644 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); 7645 7646 LLVM_DEBUG( 7647 dbgs() << "LV: Loop hints:" 7648 << " force=" 7649 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled 7650 ? "disabled" 7651 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled 7652 ? "enabled" 7653 : "?")) 7654 << " width=" << Hints.getWidth() 7655 << " unroll=" << Hints.getInterleave() << "\n"); 7656 7657 // Function containing loop 7658 Function *F = L->getHeader()->getParent(); 7659 7660 // Looking at the diagnostic output is the only way to determine if a loop 7661 // was vectorized (other than looking at the IR or machine code), so it 7662 // is important to generate an optimization remark for each loop. Most of 7663 // these messages are generated as OptimizationRemarkAnalysis. Remarks 7664 // generated as OptimizationRemark and OptimizationRemarkMissed are 7665 // less verbose reporting vectorized loops and unvectorized loops that may 7666 // benefit from vectorization, respectively. 7667 7668 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { 7669 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); 7670 return false; 7671 } 7672 7673 PredicatedScalarEvolution PSE(*SE, *L); 7674 7675 // Check if it is legal to vectorize the loop. 7676 LoopVectorizationRequirements Requirements(*ORE); 7677 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE, 7678 &Requirements, &Hints, DB, AC); 7679 if (!LVL.canVectorize(EnableVPlanNativePath)) { 7680 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); 7681 Hints.emitRemarkWithHints(); 7682 return false; 7683 } 7684 7685 // Check the function attributes and profiles to find out if this function 7686 // should be optimized for size. 7687 ScalarEpilogueLowering SEL = getScalarEpilogueLowering( 7688 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); 7689 7690 // Entrance to the VPlan-native vectorization path. Outer loops are processed 7691 // here. They may require CFG and instruction level transformations before 7692 // even evaluating whether vectorization is profitable. Since we cannot modify 7693 // the incoming IR, we need to build VPlan upfront in the vectorization 7694 // pipeline. 7695 if (!L->empty()) 7696 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, 7697 ORE, BFI, PSI, Hints); 7698 7699 assert(L->empty() && "Inner loop expected."); 7700 7701 // Check the loop for a trip count threshold: vectorize loops with a tiny trip 7702 // count by optimizing for size, to minimize overheads. 7703 auto ExpectedTC = getSmallBestKnownTC(*SE, L); 7704 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { 7705 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. 
" 7706 << "This loop is worth vectorizing only if no scalar " 7707 << "iteration overheads are incurred."); 7708 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) 7709 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); 7710 else { 7711 LLVM_DEBUG(dbgs() << "\n"); 7712 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; 7713 } 7714 } 7715 7716 // Check the function attributes to see if implicit floats are allowed. 7717 // FIXME: This check doesn't seem possibly correct -- what if the loop is 7718 // an integer loop and the vector instructions selected are purely integer 7719 // vector instructions? 7720 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { 7721 reportVectorizationFailure( 7722 "Can't vectorize when the NoImplicitFloat attribute is used", 7723 "loop not vectorized due to NoImplicitFloat attribute", 7724 "NoImplicitFloat", ORE, L); 7725 Hints.emitRemarkWithHints(); 7726 return false; 7727 } 7728 7729 // Check if the target supports potentially unsafe FP vectorization. 7730 // FIXME: Add a check for the type of safety issue (denormal, signaling) 7731 // for the target we're vectorizing for, to make sure none of the 7732 // additional fp-math flags can help. 7733 if (Hints.isPotentiallyUnsafe() && 7734 TTI->isFPVectorizationPotentiallyUnsafe()) { 7735 reportVectorizationFailure( 7736 "Potentially unsafe FP op prevents vectorization", 7737 "loop not vectorized due to unsafe FP support.", 7738 "UnsafeFP", ORE, L); 7739 Hints.emitRemarkWithHints(); 7740 return false; 7741 } 7742 7743 bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); 7744 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); 7745 7746 // If an override option has been passed in for interleaved accesses, use it. 7747 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) 7748 UseInterleaved = EnableInterleavedMemAccesses; 7749 7750 // Analyze interleaved memory accesses. 7751 if (UseInterleaved) { 7752 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); 7753 } 7754 7755 // Use the cost model. 7756 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, 7757 F, &Hints, IAI); 7758 CM.collectValuesToIgnore(); 7759 7760 // Use the planner for vectorization. 7761 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); 7762 7763 // Get user vectorization factor. 7764 unsigned UserVF = Hints.getWidth(); 7765 7766 // Plan how to best vectorize, return the best VF and its cost. 7767 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); 7768 7769 VectorizationFactor VF = VectorizationFactor::Disabled(); 7770 unsigned IC = 1; 7771 unsigned UserIC = Hints.getInterleave(); 7772 7773 if (MaybeVF) { 7774 VF = *MaybeVF; 7775 // Select the interleave count. 7776 IC = CM.selectInterleaveCount(VF.Width, VF.Cost); 7777 } 7778 7779 // Identify the diagnostic messages that should be produced. 
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7836 ORE->emit([&]() { 7837 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, 7838 L->getStartLoc(), L->getHeader()) 7839 << VecDiagMsg.second; 7840 }); 7841 ORE->emit([&]() { 7842 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, 7843 L->getStartLoc(), L->getHeader()) 7844 << IntDiagMsg.second; 7845 }); 7846 return false; 7847 } else if (!VectorizeLoop && InterleaveLoop) { 7848 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7849 ORE->emit([&]() { 7850 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, 7851 L->getStartLoc(), L->getHeader()) 7852 << VecDiagMsg.second; 7853 }); 7854 } else if (VectorizeLoop && !InterleaveLoop) { 7855 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7856 << ") in " << DebugLocStr << '\n'); 7857 ORE->emit([&]() { 7858 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, 7859 L->getStartLoc(), L->getHeader()) 7860 << IntDiagMsg.second; 7861 }); 7862 } else if (VectorizeLoop && InterleaveLoop) { 7863 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width 7864 << ") in " << DebugLocStr << '\n'); 7865 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); 7866 } 7867 7868 LVP.setBestPlan(VF.Width, IC); 7869 7870 using namespace ore; 7871 bool DisableRuntimeUnroll = false; 7872 MDNode *OrigLoopID = L->getLoopID(); 7873 7874 if (!VectorizeLoop) { 7875 assert(IC > 1 && "interleave count should not be 1 or 0"); 7876 // If we decided that it is not legal to vectorize the loop, then 7877 // interleave it. 7878 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, 7879 &CM); 7880 LVP.executePlan(Unroller, DT); 7881 7882 ORE->emit([&]() { 7883 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), 7884 L->getHeader()) 7885 << "interleaved loop (interleaved count: " 7886 << NV("InterleaveCount", IC) << ")"; 7887 }); 7888 } else { 7889 // If we decided that it is *legal* to vectorize the loop, then do it. 7890 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, 7891 &LVL, &CM); 7892 LVP.executePlan(LB, DT); 7893 ++LoopsVectorized; 7894 7895 // Add metadata to disable runtime unrolling a scalar loop when there are 7896 // no runtime checks about strides and memory. A scalar loop that is 7897 // rarely used is not worth unrolling. 7898 if (!LB.areSafetyChecksAdded()) 7899 DisableRuntimeUnroll = true; 7900 7901 // Report the vectorization decision. 7902 ORE->emit([&]() { 7903 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), 7904 L->getHeader()) 7905 << "vectorized loop (vectorization width: " 7906 << NV("VectorizationFactor", VF.Width) 7907 << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; 7908 }); 7909 } 7910 7911 Optional<MDNode *> RemainderLoopID = 7912 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, 7913 LLVMLoopVectorizeFollowupEpilogue}); 7914 if (RemainderLoopID.hasValue()) { 7915 L->setLoopID(RemainderLoopID.getValue()); 7916 } else { 7917 if (DisableRuntimeUnroll) 7918 AddRuntimeUnrollDisableMetaData(L); 7919 7920 // Mark the loop as already vectorized to avoid vectorizing again. 
7921 Hints.setAlreadyVectorized(); 7922 } 7923 7924 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent())); 7925 return true; 7926 } 7927 7928 bool LoopVectorizePass::runImpl( 7929 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, 7930 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, 7931 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, 7932 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, 7933 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { 7934 SE = &SE_; 7935 LI = &LI_; 7936 TTI = &TTI_; 7937 DT = &DT_; 7938 BFI = &BFI_; 7939 TLI = TLI_; 7940 AA = &AA_; 7941 AC = &AC_; 7942 GetLAA = &GetLAA_; 7943 DB = &DB_; 7944 ORE = &ORE_; 7945 PSI = PSI_; 7946 7947 // Don't attempt if 7948 // 1. the target claims to have no vector registers, and 7949 // 2. interleaving won't help ILP. 7950 // 7951 // The second condition is necessary because, even if the target has no 7952 // vector registers, loop vectorization may still enable scalar 7953 // interleaving. 7954 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && 7955 TTI->getMaxInterleaveFactor(1) < 2) 7956 return false; 7957 7958 bool Changed = false; 7959 7960 // The vectorizer requires loops to be in simplified form. 7961 // Since simplification may add new inner loops, it has to run before the 7962 // legality and profitability checks. This means running the loop vectorizer 7963 // will simplify all loops, regardless of whether anything end up being 7964 // vectorized. 7965 for (auto &L : *LI) 7966 Changed |= 7967 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); 7968 7969 // Build up a worklist of inner-loops to vectorize. This is necessary as 7970 // the act of vectorizing or partially unrolling a loop creates new loops 7971 // and can invalidate iterators across the loops. 7972 SmallVector<Loop *, 8> Worklist; 7973 7974 for (Loop *L : *LI) 7975 collectSupportedLoops(*L, LI, ORE, Worklist); 7976 7977 LoopsAnalyzed += Worklist.size(); 7978 7979 // Now walk the identified inner loops. 7980 while (!Worklist.empty()) { 7981 Loop *L = Worklist.pop_back_val(); 7982 7983 // For the inner loops we actually process, form LCSSA to simplify the 7984 // transform. 7985 Changed |= formLCSSARecursively(*L, *DT, LI, SE); 7986 7987 Changed |= processLoop(L); 7988 } 7989 7990 // Process each loop nest in the function. 7991 return Changed; 7992 } 7993 7994 PreservedAnalyses LoopVectorizePass::run(Function &F, 7995 FunctionAnalysisManager &AM) { 7996 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 7997 auto &LI = AM.getResult<LoopAnalysis>(F); 7998 auto &TTI = AM.getResult<TargetIRAnalysis>(F); 7999 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 8000 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F); 8001 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); 8002 auto &AA = AM.getResult<AAManager>(F); 8003 auto &AC = AM.getResult<AssumptionAnalysis>(F); 8004 auto &DB = AM.getResult<DemandedBitsAnalysis>(F); 8005 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 8006 MemorySSA *MSSA = EnableMSSALoopDependency 8007 ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() 8008 : nullptr; 8009 8010 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); 8011 std::function<const LoopAccessInfo &(Loop &)> GetLAA = 8012 [&](Loop &L) -> const LoopAccessInfo & { 8013 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; 8014 return LAM.getResult<LoopAccessAnalysis>(L, AR); 8015 }; 8016 const ModuleAnalysisManager &MAM = 8017 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); 8018 ProfileSummaryInfo *PSI = 8019 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); 8020 bool Changed = 8021 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); 8022 if (!Changed) 8023 return PreservedAnalyses::all(); 8024 PreservedAnalyses PA; 8025 8026 // We currently do not preserve loopinfo/dominator analyses with outer loop 8027 // vectorization. Until this is addressed, mark these analyses as preserved 8028 // only for non-VPlan-native path. 8029 // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 8030 if (!EnableVPlanNativePath) { 8031 PA.preserve<LoopAnalysis>(); 8032 PA.preserve<DominatorTreeAnalysis>(); 8033 } 8034 PA.preserve<BasicAA>(); 8035 PA.preserve<GlobalsAA>(); 8036 return PA; 8037 } 8038